update libaom to rev 0ec86ac7ae1e32a7e70410fa4972a655ec3670a4 (without moz.build and aom_ports/aom_once.h)

2026-05-26 15:02:46 +00:00 · 2019-02-01 21:11:22 +08:00
parent 3abe807b64
commit b246b0a6ee
92 changed files with 3631 additions and 3728 deletions
@@ -10,4 +10,4 @@ The upstream aom git repository is:

    https://aomedia.googlesource.com/aom

-The git commit ID used was b46542180d551d5e4eb666cf35dd62395ba43f3e.
+The git commit ID used was 0ec86ac7ae1e32a7e70410fa4972a655ec3670a4.
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@
 .equ CONFIG_BIG_ENDIAN, 0
 .equ CONFIG_BITSTREAM_DEBUG, 0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING, 0
-.equ CONFIG_COLLECT_INTER_MODE_RD_STATS, 0
+.equ CONFIG_COLLECT_PARTITION_STATS, 0
 .equ CONFIG_COLLECT_RD_STATS, 0
 .equ CONFIG_DEBUG, 0
 .equ CONFIG_DENOISE, 1
@@ -30,7 +30,6 @@
 .equ CONFIG_DIST_8X8, 0
 .equ CONFIG_ENTROPY_STATS, 0
 .equ CONFIG_FILEOPTIONS, 1
-.equ CONFIG_FIX_GF_LENGTH, 1
 .equ CONFIG_FP_MB_STATS, 0
 .equ CONFIG_GCC, 1
 .equ CONFIG_GCOV, 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -22,7 +22,7 @@ CONFIG_AV1_ENCODER equ 0
 CONFIG_BIG_ENDIAN equ 0
 CONFIG_BITSTREAM_DEBUG equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
 CONFIG_COLLECT_RD_STATS equ 0
 CONFIG_DEBUG equ 0
 CONFIG_DENOISE equ 1
@@ -30,7 +30,6 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
 CONFIG_DIST_8X8 equ 0
 CONFIG_ENTROPY_STATS equ 0
 CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
 CONFIG_FP_MB_STATS equ 0
 CONFIG_GCC equ 1
 CONFIG_GCOV equ 0
@@ -24,7 +24,7 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_BITSTREAM_DEBUG 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
 #define CONFIG_COLLECT_RD_STATS 0
 #define CONFIG_DEBUG 0
 #define CONFIG_DENOISE 1
@@ -32,7 +32,6 @@
 #define CONFIG_DIST_8X8 0
 #define CONFIG_ENTROPY_STATS 0
 #define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_GCC 1
 #define CONFIG_GCOV 0
@@ -973,9 +973,22 @@ enum aome_enc_control_id {
  /*!\brief Control to use a reduced tx type set */
  AV1E_SET_REDUCED_TX_TYPE_SET,

+  /*!\brief Control to use dct only for intra modes */
+  AV1E_SET_INTRA_DCT_ONLY,
+
+  /*!\brief Control to use dct only for inter modes */
+  AV1E_SET_INTER_DCT_ONLY,
+
+  /*!\brief Control to use adaptive quantize_b */
+  AV1E_SET_QUANT_B_ADAPT,
+
  /*!\brief Control to select maximum height for the GF group pyramid structure
   * (valid values: 1 - 4) */
  AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
+
+  /*!\brief Control to select maximum reference frames allowed per frame
+   * (valid values: 3 - 7) */
+  AV1E_SET_MAX_REFERENCE_FRAMES,
 };

 /*!\brief aom 1-D scaling mode
@@ -1350,9 +1363,21 @@ AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int)
 AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, unsigned int)
 #define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET

+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, unsigned int)
+#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, unsigned int)
+#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, unsigned int)
+#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT
+
 AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int)
 #define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT

+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
@@ -226,6 +226,7 @@ if(CONFIG_AV1_ENCODER)
              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
              "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
@@ -522,6 +522,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
  specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";

  add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/aom_quantize_b_64x64 ssse3/;
 }  # CONFIG_AV1_ENCODER

 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -12,6 +12,68 @@
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"

+void quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int coeff = coeff_ptr[rc] * wt;
+
+    int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * 325, 7);
+    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add) &&
+        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add))
+      non_zero_count--;
+    else
+      break;
+  }
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    int tmp32;
+
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+      int64_t tmp =
+          clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+                INT16_MIN, INT16_MAX);
+      tmp *= wt;
+      tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                     quant_shift_ptr[rc != 0]) >>
+                    (16 - log_scale + AOM_QM_BITS));  // quantization
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+      if (tmp32) eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
@@ -74,6 +136,64 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
  *eob_ptr = eob + 1;
 }

+void highbd_quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  int i, eob = -1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int dequant;
+  int idx_arr[4096];
+  (void)iscan;
+  int idx = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int coeff = coeff_ptr[rc] * wt;
+
+    // If the coefficient is out of the base ZBIN range, keep it for
+    // quantization.
+    int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * 325, 7);
+    if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add) &&
+        coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add))
+      idx_arr[idx++] = i;
+  }
+
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = scan[idx_arr[i]];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 =
+        abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+    const int64_t tmpw = tmp1 * wt;
+    const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+    const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+                                 (16 - log_scale + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =
+        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+    if (abs_qcoeff) eob = idx_arr[i];
+  }
+  *eob_ptr = eob + 1;
+}
+
 void highbd_quantize_b_helper_c(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -133,6 +253,80 @@ void highbd_quantize_b_helper_c(

 /* These functions should only be called when quantisation matrices
   are not used. */
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 0);
+}
+
+void aom_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                               quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                               NULL, NULL, 2);
+}
+
+void aom_highbd_quantize_b_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 0);
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 1);
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                      quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                      dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                      iscan, NULL, NULL, 2);
+}
+
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      const int16_t *zbin_ptr, const int16_t *round_ptr,
                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
@@ -20,6 +20,66 @@
 extern "C" {
 #endif

+void quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void highbd_quantize_b_adaptive_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
+
 void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
@@ -18,28 +18,6 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/quantize_x86.h"

-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
-                                      tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  __m128i one = _mm_set1_epi16(1);
-  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-}
-
 void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+                                          const __m128i quant,
+                                          const __m128i *shift) {
+  __m128i tmp, qcoeff, tmp1;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, 14);
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, 2);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+                                                     const __m128i dequant,
+                                                     const __m128i zero,
+                                                     tran_low_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+  const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  // "Divide" by 4.
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2);
+
+  dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i two = _mm_set1_epi16(2);
+  int index;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1, all_zero;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+  (void)n_coeffs;
+
+  // Setup global values.
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // Shift with rounding.
+  zbin = _mm_add_epi16(zbin, two);
+  round = _mm_add_epi16(round, two);
+  zbin = _mm_srli_epi16(zbin, 2);
+  round = _mm_srli_epi16(round, 2);
+  zbin = _mm_sub_epi16(zbin, one);
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+    // Reinsert signs.
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs.
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+    eob =
+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < 1024; index += 16) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+      continue;
+    }
+    calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+    calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
+                                      dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
+                                      dqcoeff_ptr + 8 + index);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
@@ -75,3 +75,23 @@ static INLINE int16_t accumulate_eob(__m128i eob) {
  eob = _mm_max_epi16(eob, eob_shuffled);
  return _mm_extract_epi16(eob, 1);
 }
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+  assert(sizeof(tran_low_t) == 4);
+  const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr));
+  const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+  return _mm_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+                                      tran_low_t *coeff_ptr) {
+  assert(sizeof(tran_low_t) == 4);
+
+  __m128i one = _mm_set1_epi16(1);
+  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+}
@@ -167,35 +167,7 @@ static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
 }
-static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
-                                             __m256i *const out) {
-  const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]);
-  const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]);
-  const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]);
-  const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]);
-  const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]);
-  const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]);
-  const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]);

-  const __m256i b0 = _mm256_unpacklo_epi32(a0, a1);
-  const __m256i b1 = _mm256_unpacklo_epi32(a2, a3);
-  const __m256i b2 = _mm256_unpacklo_epi32(a4, a5);
-  const __m256i b3 = _mm256_unpacklo_epi32(a6, a7);
-  const __m256i b4 = _mm256_unpackhi_epi32(a0, a1);
-  const __m256i b5 = _mm256_unpackhi_epi32(a2, a3);
-  const __m256i b6 = _mm256_unpackhi_epi32(a4, a5);
-  const __m256i b7 = _mm256_unpackhi_epi32(a6, a7);
-
-  out[0] = _mm256_unpacklo_epi64(b0, b1);
-  out[1] = _mm256_unpackhi_epi64(b0, b1);
-  out[2] = _mm256_unpacklo_epi64(b4, b5);
-  out[3] = _mm256_unpackhi_epi64(b4, b5);
-  out[4] = _mm256_unpacklo_epi64(b2, b3);
-  out[5] = _mm256_unpackhi_epi64(b2, b3);
-  out[6] = _mm256_unpacklo_epi64(b6, b7);
-  out[7] = _mm256_unpackhi_epi64(b6, b7);
-}
 static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
@@ -263,61 +235,7 @@ static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
    }
  }
 }
-static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
-  const __m256i scale_rounding =
-      pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
-  const __m256i b = _mm256_madd_epi16(a, scale_rounding);
-  return _mm256_srai_epi32(b, NewSqrt2Bits);
-}
-static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
-                                                     int32_t *const b) {
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i a_lo = _mm256_unpacklo_epi16(a, one);
-  const __m256i a_hi = _mm256_unpackhi_epi16(a, one);
-  const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
-  const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
-  const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31);
-  _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo));
-  _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi));
-  _mm256_store_si256((__m256i *)(b + 64), temp);
-}
-static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
-    const __m256i *const in, int32_t *const out, const int stride,
-    const int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride);
-  }
-}
-static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
-                            __m256i *out) {
-  out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1);
-  out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1);
-  out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1);
-  out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1);
-  out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1);
-  out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1);
-  out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1);
-  out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1);
-}
-static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
-  out1[0] = _mm256_castsi256_si128(in[0]);
-  out1[1] = _mm256_castsi256_si128(in[1]);
-  out1[2] = _mm256_castsi256_si128(in[2]);
-  out1[3] = _mm256_castsi256_si128(in[3]);
-  out1[4] = _mm256_castsi256_si128(in[4]);
-  out1[5] = _mm256_castsi256_si128(in[5]);
-  out1[6] = _mm256_castsi256_si128(in[6]);
-  out1[7] = _mm256_castsi256_si128(in[7]);

-  out1[8] = _mm256_extractf128_si256(in[0], 0x01);
-  out1[9] = _mm256_extractf128_si256(in[1], 0x01);
-  out1[10] = _mm256_extractf128_si256(in[2], 0x01);
-  out1[11] = _mm256_extractf128_si256(in[3], 0x01);
-  out1[12] = _mm256_extractf128_si256(in[4], 0x01);
-  out1[13] = _mm256_extractf128_si256(in[5], 0x01);
-  out1[14] = _mm256_extractf128_si256(in[6], 0x01);
-  out1[15] = _mm256_extractf128_si256(in[7], 0x01);
-}
 #ifdef __cplusplus
 }
 #endif
@@ -26,7 +26,7 @@ extern "C" {
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
-#define AOM_DEC_BORDER_IN_PIXELS 288
+#define AOM_DEC_BORDER_IN_PIXELS 64

 typedef struct yv12_buffer_config {
  union {
@@ -484,6 +484,7 @@ static int main_loop(int argc, const char **argv_) {
  input.webm_ctx = &webm_ctx;
 #endif
  struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 };
+  int is_ivf = 0;

  obu_ctx.avx_ctx = &aom_input_ctx;
  input.obu_ctx = &obu_ctx;
@@ -610,8 +611,10 @@ static int main_loop(int argc, const char **argv_) {
 #endif
  input.aom_input_ctx->filename = fn;
  input.aom_input_ctx->file = infile;
-  if (file_is_ivf(input.aom_input_ctx))
+  if (file_is_ivf(input.aom_input_ctx)) {
    input.aom_input_ctx->file_type = FILE_TYPE_IVF;
+    is_ivf = 1;
+  }
 #if CONFIG_WEBM_IO
  else if (file_is_webm(input.webm_ctx, input.aom_input_ctx))
    input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
@@ -661,6 +664,10 @@ static int main_loop(int argc, const char **argv_) {
  }

  fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+
+  if (is_ivf && !fourcc_interface)
+    fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
+
  if (interface && fourcc_interface && interface != fourcc_interface)
    warn("Header indicates codec: %s\n", fourcc_interface->name);
  else
@@ -263,9 +263,9 @@ static const arg_def_t global_error_resilient =
            "Enable global error resiliency features");
 static const arg_def_t lag_in_frames =
    ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
-static const arg_def_t large_scale_tile =
-    ARG_DEF(NULL, "large-scale-tile", 1,
-            "Large scale tile coding (0: off (default), 1: on)");
+static const arg_def_t large_scale_tile = ARG_DEF(
+    NULL, "large-scale-tile", 1,
+    "Large scale tile coding (0: off (default), 1: on (ivf output only))");
 static const arg_def_t monochrome =
    ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
 static const arg_def_t full_still_picture_hdr = ARG_DEF(
@@ -532,6 +532,12 @@ static const arg_def_t qm_max = ARG_DEF(
    NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15");
 static const arg_def_t reduced_tx_type_set = ARG_DEF(
    NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types");
+static const arg_def_t use_intra_dct_only =
+    ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes");
+static const arg_def_t use_inter_dct_only =
+    ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes");
+static const arg_def_t quant_b_adapt =
+    ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b");
 #if CONFIG_DIST_8X8
 static const arg_def_t enable_dist_8x8 =
    ARG_DEF(NULL, "enable-dist-8x8", 1,
@@ -602,6 +608,9 @@ static const arg_def_t max_gf_interval = ARG_DEF(
 static const arg_def_t gf_max_pyr_height =
    ARG_DEF(NULL, "gf-max-pyr-height", 1,
            "maximum height for GF group pyramid structure (1 to 4 (default))");
+static const arg_def_t max_reference_frames = ARG_DEF(
+    NULL, "max-reference-frames", 1,
+    "maximum number of reference frames allowed per frame (3 to 7 (default))");

 static const struct arg_enum_list color_primaries_enum[] = {
  { "bt709", AOM_CICP_CP_BT_709 },
@@ -752,6 +761,9 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
                                       &qm_min,
                                       &qm_max,
                                       &reduced_tx_type_set,
+                                       &use_intra_dct_only,
+                                       &use_inter_dct_only,
+                                       &quant_b_adapt,
 #if CONFIG_DIST_8X8
                                       &enable_dist_8x8,
 #endif
@@ -779,7 +791,8 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
 #if CONFIG_DENOISE
                                       &denoise_noise_level,
                                       &denoise_block_size,
-#endif
+#endif  // CONFIG_DENOISE
+                                       &max_reference_frames,
                                       &enable_ref_frame_mvs,
                                       &bitdeptharg,
                                       &inbitdeptharg,
@@ -834,6 +847,9 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
                                        AV1E_SET_QM_MIN,
                                        AV1E_SET_QM_MAX,
                                        AV1E_SET_REDUCED_TX_TYPE_SET,
+                                        AV1E_SET_INTRA_DCT_ONLY,
+                                        AV1E_SET_INTER_DCT_ONLY,
+                                        AV1E_SET_QUANT_B_ADAPT,
 #if CONFIG_DIST_8X8
                                        AV1E_SET_ENABLE_DIST_8X8,
 #endif
@@ -861,7 +877,8 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
 #if CONFIG_DENOISE
                                        AV1E_SET_DENOISE_NOISE_LEVEL,
                                        AV1E_SET_DENOISE_BLOCK_SIZE,
-#endif
+#endif  // CONFIG_DENOISE
+                                        AV1E_SET_MAX_REFERENCE_FRAMES,
                                        AV1E_SET_ENABLE_REF_FRAME_MVS,
                                        0 };
 #endif  // CONFIG_AV1_ENCODER
@@ -1340,6 +1357,7 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
      config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
    } else if (arg_match(&arg, &large_scale_tile, argi)) {
      config->cfg.large_scale_tile = arg_parse_uint(&arg);
+      if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder();
    } else if (arg_match(&arg, &monochrome, argi)) {
      config->cfg.monochrome = 1;
    } else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
@@ -2097,6 +2115,10 @@ int main(int argc, const char **argv_) {
  FOREACH_STREAM(stream, streams) {
    check_encoder_config(global.disable_warning_prompt, &global,
                         &stream->config.cfg);
+
+    // If large_scale_tile = 1, only support to output to ivf format.
+    if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf)
+      die("only support ivf output format while large-scale-tile=1\n");
  }

  /* Handle non-option arguments */
@@ -191,6 +191,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
            "${AOM_ROOT}/av1/encoder/temporal_filter.h"
            "${AOM_ROOT}/av1/encoder/tokenize.c"
            "${AOM_ROOT}/av1/encoder/tokenize.h"
+            "${AOM_ROOT}/av1/encoder/tpl_model.c"
+            "${AOM_ROOT}/av1/encoder/tpl_model.h"
            "${AOM_ROOT}/av1/encoder/wedge_utils.c"
            "${AOM_ROOT}/third_party/fastfeat/fast.c"
            "${AOM_ROOT}/third_party/fastfeat/fast.h"
@@ -96,6 +96,7 @@ struct av1_extracfg {
  int enable_order_hint;         // enable order hint for sequence
  int enable_tx64;               // enable 64-pt transform usage for sequence
  int enable_dist_wtd_comp;      // enable dist wtd compound for sequence
+  int max_reference_frames;      // maximum number of references per frame
  int enable_ref_frame_mvs;      // sequence level
  int allow_ref_frame_mvs;       // frame level
  int enable_masked_comp;        // enable masked compound for sequence
@@ -123,6 +124,9 @@ struct av1_extracfg {
  unsigned int chroma_subsampling_x;
  unsigned int chroma_subsampling_y;
  int reduced_tx_type_set;
+  int use_intra_dct_only;
+  int use_inter_dct_only;
+  int quant_b_adapt;
 };

 static struct av1_extracfg default_extra_cfg = {
@@ -190,6 +194,7 @@ static struct av1_extracfg default_extra_cfg = {
  1,                            // frame order hint
  1,                            // enable 64-pt transform usage
  1,                            // dist-wtd compound
+  7,                            // max_reference_frames
  1,                            // enable_ref_frame_mvs sequence level
  1,                            // allow ref_frame_mvs frame level
  1,                            // enable masked compound at sequence level
@@ -216,6 +221,9 @@ static struct av1_extracfg default_extra_cfg = {
  0,  // chroma_subsampling_x
  0,  // chroma_subsampling_y
  0,  // reduced_tx_type_set
+  0,  // use_intra_dct_only
+  0,  // use_inter_dct_only
+  0,  // quant_b_adapt
 };

 struct aom_codec_alg_priv {
@@ -419,6 +427,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
 #endif
  }

+  RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
  RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
  RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);

@@ -571,6 +580,9 @@ static aom_codec_err_t set_encoder_config(
  oxcf->qm_minlevel = extra_cfg->qm_min;
  oxcf->qm_maxlevel = extra_cfg->qm_max;
  oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
+  oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only;
+  oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only;
+  oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
 #if CONFIG_DIST_8X8
  oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
  if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
@@ -711,6 +723,11 @@ static aom_codec_err_t set_encoder_config(
  oxcf->enable_order_hint = extra_cfg->enable_order_hint;
  oxcf->enable_dist_wtd_comp =
      extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
+  oxcf->max_reference_frames = extra_cfg->max_reference_frames;
+  if (oxcf->max_reference_frames > 3 && oxcf->max_reference_frames < 7) {
+    // TODO(urvang): Enable all possible values, after they work properly.
+    oxcf->max_reference_frames = 3;
+  }
  oxcf->enable_masked_comp = extra_cfg->enable_masked_comp;
  oxcf->enable_diff_wtd_comp =
      extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
@@ -1109,6 +1126,13 @@ static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx,
  return update_extra_cfg(ctx, &extra_cfg);
 }

+static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
                                                     va_list args) {
  struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1289,6 +1313,27 @@ static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx,
  return update_extra_cfg(ctx, &extra_cfg);
 }

+static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_film_grain_test_vector(
    aom_codec_alg_priv_t *ctx, va_list args) {
  struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1987,6 +2032,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
  { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
  { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
  { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
+  { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
  { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
  { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
  { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp },
@@ -2008,6 +2054,9 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
  { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
  { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
  { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set },
+  { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only },
+  { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only },
+  { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
  { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
  { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
  { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
@@ -243,8 +243,10 @@ typedef struct MB_MODE_INFO {
  // Joint sign of alpha Cb and alpha Cr
  int cfl_alpha_signs;

-  int compound_idx;
+  // Indicate if masked compound is used(1) or not(0).
  int comp_group_idx;
+  // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used.
+  int compound_idx;
 #if CONFIG_INSPECTION
  int16_t tx_skip[TXK_TYPE_BUF_LEN];
 #endif
@@ -596,7 +598,7 @@ typedef struct macroblockd {
  uint8_t *tmp_obmc_bufs[2];
 } MACROBLOCKD;

-static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
 }

@@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
  assert(pred_plane < CFL_PRED_PLANES);
  assert(width <= CFL_BUF_LINE);

-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
    uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
    memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
    return;
@@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
  assert(pred_plane < CFL_PRED_PLANES);
  assert(width <= CFL_BUF_LINE);
  assert(height <= CFL_BUF_LINE);
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
    cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
                         width, height);
@@ -196,7 +196,7 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
      cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
  assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
         CFL_BUF_SQUARE);
-  if (get_bitdepth_data_path_index(xd)) {
+  if (is_cur_buf_hbd(xd)) {
    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
    get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
                                xd->bd);
@@ -388,8 +388,7 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
    assert(!((row & 1) && tx_size_high[tx_size] != 4));
    sub8x8_adjust_offset(cfl, &row, &col);
  }
-  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
-            get_bitdepth_data_path_index(xd));
+  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
 }

 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -405,5 +404,5 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
  tx_size = get_tx_size(width, height);
  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
-            get_bitdepth_data_path_index(xd));
+            is_cur_buf_hbd(xd));
 }
@@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
  txfm_param->eob = eob;
  txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
  txfm_param->bd = xd->bd;
-  txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);
  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
      txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
 }
@@ -378,7 +378,6 @@ typedef struct AV1Common {
  int show_frame;
  int showable_frame;  // frame can be used as show existing frame in future
  int show_existing_frame;
-  int reset_decoder_state;

  uint8_t disable_cdf_update;
  int allow_high_precision_mv;
@@ -432,6 +431,7 @@ typedef struct AV1Common {
  int qm_v;
  int min_qmlevel;
  int max_qmlevel;
+  int use_quant_b_adapt;

  /* We allocate a MB_MODE_INFO struct for each macroblock, together with
     an extra row on top and column on the left to simplify prediction. */
@@ -501,7 +501,6 @@ typedef struct AV1Common {
  int primary_ref_frame;

  int error_resilient_mode;
-  int force_primary_ref_none;

  int tile_cols, tile_rows;

@@ -642,6 +641,7 @@ static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
  if (new_fb_idx == INVALID_IDX) return NULL;

  cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
+  cm->cur_frame->buf.buf_8bit_valid = 0;
  return cm->cur_frame;
 }

@@ -84,12 +84,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
  if (do_warp && xd->cur_frame_force_integer_mv == 0) {
    const struct macroblockd_plane *const pd = &xd->plane[plane];
    const struct buf_2d *const pre_buf = &pd->pre[ref];
-    av1_warp_plane(&final_warp_params,
-                   xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+    av1_warp_plane(&final_warp_params, is_cur_buf_hbd(xd), xd->bd,
                   pre_buf->buf0, pre_buf->width, pre_buf->height,
                   pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
                   pd->subsampling_x, pd->subsampling_y, conv_params);
-  } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  } else if (is_cur_buf_hbd(xd)) {
    highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
                           w, h, conv_params, interp_filters, is_intrabc,
                           xd->bd);
@@ -568,14 +567,15 @@ static void build_masked_compound_no_round(
  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+  if (is_cur_buf_hbd(xd)) {
    aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, block_size_wide[sb_type],
                                  w, h, subw, subh, conv_params, xd->bd);
-  else
+  } else {
    aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, block_size_wide[sb_type], w,
                                 h, subw, subh, conv_params);
+  }
 }

 void av1_make_masked_inter_predictor(
@@ -800,53 +800,6 @@ void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
  return;
 }

-struct obmc_check_mv_field_ctxt {
-  MB_MODE_INFO *current_mi;
-  int mv_field_check_result;
-};
-
-static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col,
-                                           uint8_t nb_mi_width,
-                                           MB_MODE_INFO *nb_mi, void *fun_ctxt,
-                                           const int num_planes) {
-  (void)xd;
-  (void)rel_mi_col;
-  (void)nb_mi_width;
-  (void)num_planes;
-  struct obmc_check_mv_field_ctxt *ctxt =
-      (struct obmc_check_mv_field_ctxt *)fun_ctxt;
-  const MB_MODE_INFO *current_mi = ctxt->current_mi;
-
-  if (ctxt->mv_field_check_result == 0) return;
-
-  if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] ||
-      nb_mi->mv[0].as_int != current_mi->mv[0].as_int ||
-      nb_mi->interp_filters != current_mi->interp_filters) {
-    ctxt->mv_field_check_result = 0;
-  }
-  return;
-}
-
-// Check if the neighbors' motions used by obmc have same parameters as for
-// the current block. If all the parameters are identical, obmc will produce
-// the same prediction as from regular bmc, therefore we can skip the
-// overlapping operations for less complexity. The parameters checked include
-// reference frame, motion vector, and interpolation filter.
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col) {
-  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 };
-
-  foreach_overlappable_nb_above(cm, xd, mi_col,
-                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
-                                obmc_check_identical_mv, &mv_field_check_ctxt);
-  foreach_overlappable_nb_left(cm, xd, mi_row,
-                               max_neighbor_obmc[mi_size_high_log2[bsize]],
-                               obmc_check_identical_mv, &mv_field_check_ctxt);
-
-  return mv_field_check_ctxt.mv_field_check_result;
-}
-
 struct obmc_inter_pred_ctxt {
  uint8_t **adjacent;
  int *adjacent_stride;
@@ -860,7 +813,7 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
  (void)above_mi;
  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);
  const int overlap =
      AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;

@@ -897,7 +850,7 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
  const int overlap =
      AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = is_cur_buf_hbd(xd);

  for (int plane = 0; plane < num_planes; ++plane) {
    const struct macroblockd_plane *pd = &xd->plane[plane];
@@ -1142,8 +1095,8 @@ static void combine_interintra_highbd(
 void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
                                               MACROBLOCKD *xd,
                                               BLOCK_SIZE bsize, int plane,
-                                               BUFFER_SET *ctx, uint8_t *dst,
-                                               int dst_stride) {
+                                               const BUFFER_SET *ctx,
+                                               uint8_t *dst, int dst_stride) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const int ssx = xd->plane[plane].subsampling_x;
  const int ssy = xd->plane[plane].subsampling_y;
@@ -1166,7 +1119,7 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
  const int ssx = xd->plane[plane].subsampling_x;
  const int ssy = xd->plane[plane].subsampling_y;
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    combine_interintra_highbd(
        xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
        xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
@@ -1185,9 +1138,9 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
 // build interintra_predictors for one plane
 void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         uint8_t *pred, int stride,
-                                         BUFFER_SET *ctx, int plane,
+                                         const BUFFER_SET *ctx, int plane,
                                         BLOCK_SIZE bsize) {
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
    av1_build_intra_predictors_for_interintra(
        cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
@@ -1206,7 +1159,8 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
 void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *upred, uint8_t *vpred,
                                          int ustride, int vstride,
-                                          BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+                                          const BUFFER_SET *ctx,
+                                          BLOCK_SIZE bsize) {
  av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
  av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
 }
@@ -161,8 +161,6 @@ static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
 void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
 int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
                               const struct macroblockd_plane *pd, int dir);
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col);

 static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
                                              BLOCK_SIZE sb_type) {
@@ -335,17 +333,18 @@ const uint8_t *av1_get_compound_type_mask(
 // build interintra_predictors for one plane
 void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         uint8_t *pred, int stride,
-                                         BUFFER_SET *ctx, int plane,
+                                         const BUFFER_SET *ctx, int plane,
                                         BLOCK_SIZE bsize);

 void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          uint8_t *upred, uint8_t *vpred,
                                          int ustride, int vstride,
-                                          BUFFER_SET *ctx, BLOCK_SIZE bsize);
+                                          const BUFFER_SET *ctx,
+                                          BLOCK_SIZE bsize);

 void av1_build_intra_predictors_for_interintra(
    const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
-    BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+    const BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);

 void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                            const uint8_t *inter_pred, int inter_stride,
@@ -1510,7 +1510,7 @@ void av1_predict_intra_block(
                               xd->color_index_map_offset[plane != 0];
    const uint16_t *const palette =
        mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
      for (r = 0; r < txhpx; ++r) {
        for (c = 0; c < txwpx; ++c) {
@@ -1569,7 +1569,7 @@ void av1_predict_intra_block(
      tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);

  const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    build_intra_predictors_high(
        xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
        filter_intra_mode, tx_size, disable_edge_filter,
@@ -64,6 +64,9 @@

 #define ACCT_STR __func__

+#define AOM_MIN_THREADS_PER_TILE 1
+#define AOM_MAX_THREADS_PER_TILE 2
+
 // This is needed by ext_tile related unit tests.
 #define EXT_TILE_DEBUG 1
 #define MC_TEMP_BUF_PELS                       \
@@ -734,7 +737,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
                               &scaled_mv, &subpel_x_mv, &subpel_y_mv);
        pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
        src_stride = pre_buf->stride;
-        highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+        highbd = is_cur_buf_hbd(xd);
        extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
                         subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
                         &pre, &src_stride);
@@ -780,7 +783,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
                             &scaled_mv, &subpel_x_mv, &subpel_y_mv);
      pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
      src_stride[ref] = pre_buf->stride;
-      highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+      highbd = is_cur_buf_hbd(xd);

      WarpTypesAllowed warp_types;
      warp_types.global_warp_allowed = is_global[ref];
@@ -853,7 +856,7 @@ static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,

 static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
                                           MACROBLOCKD *xd, int mi_row,
-                                           int mi_col, BUFFER_SET *ctx,
+                                           int mi_col, const BUFFER_SET *ctx,
                                           BLOCK_SIZE bsize) {
  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);

@@ -868,7 +871,7 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,

 static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
                                            MACROBLOCKD *xd, int mi_row,
-                                            int mi_col, BUFFER_SET *ctx,
+                                            int mi_col, const BUFFER_SET *ctx,
                                            BLOCK_SIZE bsize) {
  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
                                        MAX_MB_PLANE - 1);
@@ -1013,7 +1016,7 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };

-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    int len = sizeof(uint16_t);
    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
    dst_buf1[1] =
@@ -3361,6 +3364,20 @@ static int tile_worker_hook(void *arg1, void *arg2) {
  return !td->xd.corrupted;
 }

+static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
+                                                  TileInfo tile) {
+  // NOTE: Currently value of max workers is calculated based
+  // on the parse and decode time. As per the theoretical estimate
+  // when percentage of parse time is equal to percentage of decode
+  // time, number of workers needed to parse + decode a tile can not
+  // exceed more than 2.
+  // TODO(any): Modify this value if parsing is optimized in future.
+  int sb_rows = av1_get_sb_rows_in_tile(cm, tile);
+  int max_workers =
+      sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
+  return max_workers;
+}
+
 // The caller must hold pbi->row_mt_mutex_ when calling this function.
 // Returns 1 if either the next job is stored in *next_job_info or 1 is stored
 // in *end_of_frame.
@@ -3391,8 +3408,8 @@ static int get_next_job_info(AV1Decoder *const pbi,
  int min_threads_working = INT_MAX;
  int max_mis_to_decode = 0;
  int tile_row_idx, tile_col_idx;
-  int tile_row = 0;
-  int tile_col = 0;
+  int tile_row = -1;
+  int tile_col = -1;

  memset(next_job_info, 0, sizeof(*next_job_info));

@@ -3440,7 +3457,9 @@ static int get_next_job_info(AV1Decoder *const pbi,
          max_mis_to_decode = 0;
        }
        if (num_threads_working == min_threads_working &&
-            num_mis_to_decode > max_mis_to_decode) {
+            num_mis_to_decode > max_mis_to_decode &&
+            num_threads_working <
+                get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) {
          max_mis_to_decode = num_mis_to_decode;
          tile_row = tile_row_idx;
          tile_col = tile_col_idx;
@@ -3448,6 +3467,8 @@ static int get_next_job_info(AV1Decoder *const pbi,
      }
    }
  }
+  // No job found to process
+  if (tile_row == -1 || tile_col == -1) return 0;

  tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
  tile_info = tile_data->tile_info;
@@ -3576,9 +3597,22 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
      TileDataDec *const tile_data = cur_job_info->tile_data;
      tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
                            allow_update_cdf);
-
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+      tile_data->dec_row_mt_sync.num_threads_working++;
+#if CONFIG_MULTITHREAD
+      pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
      // decode tile
      parse_tile_row_mt(pbi, td, tile_data);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+      tile_data->dec_row_mt_sync.num_threads_working--;
+#if CONFIG_MULTITHREAD
+      pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
    } else {
      break;
    }
@@ -4055,7 +4089,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
  int tile_cols_start;
  int tile_cols_end;
  int tile_count_tg;
-  int num_workers;
+  int num_workers = 0;
+  int max_threads;
  const uint8_t *raw_data_end = NULL;
  int max_sb_rows = 0;

@@ -4071,7 +4106,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
    tile_cols_end = tile_cols;
  }
  tile_count_tg = end_tile - start_tile + 1;
-  num_workers = pbi->max_threads;
+  max_threads = pbi->max_threads;

  // No tiles to decode.
  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
@@ -4084,7 +4119,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
  assert(tile_rows <= MAX_TILE_ROWS);
  assert(tile_cols <= MAX_TILE_COLS);
  assert(tile_count_tg > 0);
-  assert(num_workers > 0);
+  assert(max_threads > 0);
  assert(start_tile <= end_tile);
  assert(start_tile >= 0 && end_tile < n_tiles);

@@ -4116,8 +4151,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,

      max_sb_rows = AOMMAX(max_sb_rows,
                           av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
+      num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info);
    }
  }
+  num_workers = AOMMIN(num_workers, max_threads);

  if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
    for (int i = 0; i < n_tiles; ++i) {
@@ -4817,7 +4854,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
    cm->error_resilient_mode = 1;
  } else {
    cm->show_existing_frame = aom_rb_read_bit(rb);
-    cm->reset_decoder_state = 0;
+    pbi->reset_decoder_state = 0;

    if (cm->show_existing_frame) {
      if (pbi->sequence_header_changed) {
@@ -4859,7 +4896,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
      // assign_frame_buffer_p()!
      assert(!cm->cur_frame->raw_frame_buffer.data);
      assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
-      cm->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
+      pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
      unlock_buffer_pool(pool);

      cm->lf.filter_level[0] = 0;
@@ -4869,11 +4906,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
      if (!frame_to_show->showable_frame) {
        aom_merge_corrupted_flag(&xd->corrupted, 1);
      }
-      if (cm->reset_decoder_state) frame_to_show->showable_frame = 0;
+      if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;

      cm->film_grain_params = frame_to_show->film_grain_params;

-      if (cm->reset_decoder_state) {
+      if (pbi->reset_decoder_state) {
        show_existing_frame_reset(pbi, existing_frame_idx);
      } else {
        current_frame->refresh_frame_flags = 0;
@@ -5471,7 +5508,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
  if (cm->show_existing_frame) {
    // showing a frame directly
    *p_data_end = data + uncomp_hdr_size;
-    if (cm->reset_decoder_state) {
+    if (pbi->reset_decoder_state) {
      // Use the default frame context values.
      *cm->fc = *cm->default_frame_context;
      if (!cm->fc->initialized)
@@ -699,7 +699,8 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                                     mi_col, bsize, r);
    if (!valid_dv) {
      // Intra bc motion vectors are not valid - signal corrupt frame
-      aom_merge_corrupted_flag(&xd->corrupted, 1);
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid intrabc dv");
    }
  }
 }
@@ -361,7 +361,7 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
      assert(IMPLIES(!pbi->hold_ref_buf,
                     cm->current_frame.refresh_frame_flags == 0));
      assert(IMPLIES(!pbi->hold_ref_buf,
-                     cm->show_existing_frame && !cm->reset_decoder_state));
+                     cm->show_existing_frame && !pbi->reset_decoder_state));

      // The following two for loops need to release the reference stored in
      // cm->ref_frame_map[ref_index] before transferring the reference stored
@@ -374,7 +374,7 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
      }

      const int check_on_show_existing_frame =
-          !cm->show_existing_frame || cm->reset_decoder_state;
+          !cm->show_existing_frame || pbi->reset_decoder_state;
      for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
           ++ref_index) {
        decrease_ref_count(cm->ref_frame_map[ref_index], pool);
@@ -200,6 +200,7 @@ typedef struct AV1Decoder {
  int need_resync;   // wait for key/intra-only frame.
  int hold_ref_buf;  // Boolean: whether we are holding reference buffers in
                     // common.next_ref_frame_map.
+  int reset_decoder_state;

  int tile_size_bytes;
  int tile_col_size_bytes;
@@ -155,9 +155,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
    return 0;
  }

-  memset(levels_buf, 0,
-         sizeof(*levels_buf) *
-             ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
  if (plane == AOM_PLANE_Y) {
    // only y plane's tx_type is transmitted
    av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
@@ -241,6 +238,12 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
  }
  *eob = rec_eob_pos(eob_pt, eob_extra);

+  if (*eob > 1) {
+    memset(levels_buf, 0,
+           sizeof(*levels_buf) *
+               ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
+  }
+
  {
    // Read the non-zero coefficient with scan index eob-1
    // TODO(angiebird): Put this into a function
@@ -252,7 +255,7 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
        ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
    int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
    if (level > NUM_BASE_LEVELS) {
-      const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+      const int br_ctx = get_br_ctx_eob(pos, bwl, tx_class);
      cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
        const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
@@ -121,7 +121,7 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {

  for (i = 0; i < bh; i += 4) {
    for (j = 0; j < bw; j += 4) {
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (is_cur_buf_hbd(xd)) {
        var +=
            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
@@ -153,7 +153,7 @@ static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
  uint8_t *buf = x->plane[0].src.buf;
  const int bw = MI_SIZE * mi_size_wide[bs];
  const int bh = MI_SIZE * mi_size_high[bs];
-  int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int hbd = is_cur_buf_hbd(xd);

  int var = 0;
  for (int r = 0; r < bh; r += 8)
@@ -41,47 +41,37 @@ static void quantize_fp_helper_c(
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr, int log_scale) {
  int i, eob = -1;
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
+  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (qm_ptr == NULL && iqm_ptr == NULL) {
-    const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
-    {  // rc == 0
-      const int coeff = coeff_ptr[0];
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) {
-        abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX);
-        const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale));
+      int tmp32 = 0;
+      if ((abs_coeff << (1 + log_scale)) >= thresh) {
+        abs_coeff =
+            clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
        if (tmp32) {
-          qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale;
-          dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-          eob = 0;
-        }
-      }
-    }
-    const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
-    const int32_t thresh1 = (int32_t)(dequant_ptr[1]);
-    for (i = 1; i < n_coeffs; i++) {
-      const int coeff = coeff_ptr[i];
-      const int coeff_sign = (coeff >> 31);
-      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      if ((abs_coeff << (1 + log_scale)) >= thresh1) {
-        abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX);
-        const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale));
-        if (tmp32) {
-          qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale;
-          dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-          eob = AOMMAX(iscan[i], eob);
+          qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+          const tran_low_t abs_dqcoeff =
+              (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+          dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
        }
      }
+      if (tmp32) eob = i;
    }
  } else {
    // Quantization pass: All coefficients with index >= zero_flag are
@@ -99,7 +89,7 @@ static void quantize_fp_helper_c(
      int tmp32 = 0;
      if (abs_coeff * wt >=
          (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
-        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+        abs_coeff += rounding[rc != 0];
        abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
        tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
                      (16 - log_scale + AOM_QM_BITS));
@@ -275,32 +265,65 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
  const qm_val_t *qm_ptr = qparam->qmatrix;
  const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  if (qm_ptr != NULL && iqm_ptr != NULL) {
-    quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                        p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                        sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+  if (qparam->use_quant_b_adapt) {
+    // TODO(sarahparker) These quantize_b optimizations need SIMD
+    // implementations
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      quantize_b_adaptive_helper_c(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+          sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          aom_quantize_b_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 1:
+          aom_quantize_b_32x32_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 2:
+          aom_quantize_b_64x64_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        default: assert(0);
+      }
+    }
  } else {
-    switch (qparam->log_scale) {
-      case 0:
-        aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                       p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                       dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                       sc->iscan);
-        break;
-      case 1:
-        aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                             sc->iscan);
-        break;
-      case 2:
-        aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                             p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                             dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                             sc->iscan);
-        break;
-      default: assert(0);
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                          p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                          dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                          sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                         p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                         sc->iscan);
+          break;
+        case 1:
+          aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan);
+          break;
+        case 2:
+          aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan);
+          break;
+        default: assert(0);
+      }
    }
  }
 }
@@ -391,41 +414,81 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
                                  const QUANT_PARAM *qparam) {
  const qm_val_t *qm_ptr = qparam->qmatrix;
  const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  if (qm_ptr != NULL && iqm_ptr != NULL) {
-    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                               sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+  if (qparam->use_quant_b_adapt) {
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      highbd_quantize_b_adaptive_helper_c(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+          sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          if (LIKELY(n_coeffs >= 8)) {
+            aom_highbd_quantize_b_adaptive_c(
+                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                eob_ptr, sc->scan, sc->iscan);
+          } else {
+            // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+            // quantization
+            aom_highbd_quantize_b_adaptive_c(
+                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                eob_ptr, sc->scan, sc->iscan);
+          }
+          break;
+        case 1:
+          aom_highbd_quantize_b_32x32_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 2:
+          aom_highbd_quantize_b_64x64_adaptive_c(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        default: assert(0);
+      }
+    }
  } else {
-    switch (qparam->log_scale) {
-      case 0:
-        if (LIKELY(n_coeffs >= 8)) {
-          aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                                p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                                dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                                sc->iscan);
-        } else {
-          // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
-          // quantization
-          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
+    if (qm_ptr != NULL && iqm_ptr != NULL) {
+      highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                                 p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                                 dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                                 sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+    } else {
+      switch (qparam->log_scale) {
+        case 0:
+          if (LIKELY(n_coeffs >= 8)) {
+            aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX,
                                  p->round_QTX, p->quant_QTX,
                                  p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
                                  p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
-        }
-        break;
-      case 1:
-        aom_highbd_quantize_b_32x32(
-            coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-            p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-            eob_ptr, sc->scan, sc->iscan);
-        break;
-      case 2:
-        aom_highbd_quantize_b_64x64(
-            coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-            p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-            eob_ptr, sc->scan, sc->iscan);
-        break;
-      default: assert(0);
+          } else {
+            // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+            // quantization
+            aom_highbd_quantize_b_c(
+                coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+                eob_ptr, sc->scan, sc->iscan);
+          }
+          break;
+        case 1:
+          aom_highbd_quantize_b_32x32(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        case 2:
+          aom_highbd_quantize_b_64x64(
+              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+              eob_ptr, sc->scan, sc->iscan);
+          break;
+        default: assert(0);
+      }
    }
  }
 }
@@ -27,6 +27,7 @@ typedef struct QUANT_PARAM {
  TX_SIZE tx_size;
  const qm_val_t *qmatrix;
  const qm_val_t *iqmatrix;
+  int use_quant_b_adapt;
 } QUANT_PARAM;

 typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -54,10 +54,10 @@ typedef struct macroblock_plane {
 typedef struct {
  int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
  int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
-  int base_cost[SIG_COEF_CONTEXTS][4];
+  int base_cost[SIG_COEF_CONTEXTS][8];
  int eob_extra_cost[EOB_COEF_CONTEXTS][2];
  int dc_sign_cost[DC_SIGN_CONTEXTS][2];
-  int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
+  int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
 } LV_MAP_COEFF_COST;

 typedef struct {
@@ -192,16 +192,14 @@ typedef struct {
  int32_t rate[COMPOUND_TYPES];
  int64_t dist[COMPOUND_TYPES];
  int_mv mv[2];
-  int8_t ref_frames[2];
+  MV_REFERENCE_FRAME ref_frames[2];
  PREDICTION_MODE mode;
  InterpFilters filter;
  int ref_mv_idx;
  int is_global[2];
 } COMP_RD_STATS;

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 struct inter_modes_info;
-#endif
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
  struct macroblock_plane plane[MAX_MB_PLANE];
@@ -300,9 +298,7 @@ struct macroblock {
  // to the accurate tile context.
  FRAME_CONTEXT *tile_pb_ctx;

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
  struct inter_modes_info *inter_modes_info;
-#endif

  // buffer for hash value calculation of a block
  // used only in av1_get_block_hash_value()
@@ -30,6 +30,10 @@ extern const uint16_t av1_prob_cost[128];

 // Calculate the cost of a symbol with probability p15 / 2^15
 static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+  // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the
+  // following cost calculation works correctly. Otherwise, if p15 =
+  // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong.
+  p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1);
  assert(0 < p15 && p15 < CDF_PROB_TOP);
  const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
  const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
@@ -11,12 +11,244 @@

 #include <stdint.h>

+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
 #include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG

 #include "av1/common/onyxc_int.h"

 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+// Define the reference buffers that will be updated post encode.
+void av1_configure_buffer_updates(AV1_COMP *cpi, const FRAME_UPDATE_TYPE type) {
+  // NOTE(weitinglin): Should we define another function to take care of
+  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+
+  // show_existing_frame is a flag left set from the end of encoding the
+  // previous frame.  Alongside it, is_src_frame_alt_ref may also be left
+  // set so shouldn't be cleared in this case.
+  if (!cpi->common.show_existing_frame) cpi->rc.is_src_frame_alt_ref = 0;
+
+  cpi->rc.is_bwd_ref_frame = 0;
+  cpi->rc.is_last_bipred_frame = 0;
+  cpi->rc.is_bipred_frame = 0;
+  cpi->rc.is_src_frame_ext_arf = 0;
+
+  switch (type) {
+    case KF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt2_ref_frame = 1;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+
+    case LF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    case GF_UPDATE:
+      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
+      //               needed.
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    case OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+
+    case ARF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+
+    case BRF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_bwd_ref_frame = 1;
+      break;
+
+    case LAST_BIPRED_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_last_bipred_frame = 1;
+      break;
+
+    case BIPRED_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_bipred_frame = 1;
+      break;
+
+    case INTNL_OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      cpi->rc.is_src_frame_ext_arf = 1;
+      break;
+
+    case INTNL_ARF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      if (cpi->new_bwdref_update_rule == 1 && cpi->oxcf.pass == 2) {
+        cpi->refresh_bwd_ref_frame = 1;
+        cpi->refresh_alt2_ref_frame = 0;
+      } else {
+        cpi->refresh_bwd_ref_frame = 0;
+        cpi->refresh_alt2_ref_frame = 1;
+      }
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    default: assert(0); break;
+  }
+}
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+                                       unsigned int *const frame_flags) {
+  if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY;
+  if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH;
+  if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+}
+
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+  //               differently here for rc->avg_frame_bandwidth.
+  if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
+    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+        cpi->common.current_frame.frame_type == KEY_FRAME) {
+      // If this is a show_existing_frame with a source other than altref,
+      // or if it is not a displayed forward keyframe, the keyframe update
+      // counters were incremented when it was originally encoded.
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+    }
+  }
+}
+
+static INLINE int is_frame_droppable(const AV1_COMP *const cpi) {
+  return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+           cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
+           cpi->refresh_last_frame);
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is drop.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame. If this is
+  // a show_existing_frame with a source other than altref, or if it is not
+  // a displayed forward keyframe, the index was incremented when it was
+  // originally encoded.
+  if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+      cpi->common.current_frame.frame_type == KEY_FRAME) {
+    ++cpi->twopass.gf_group.index;
+  }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+  if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
+static void check_show_existing_frame(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+  const FRAME_UPDATE_TYPE next_frame_update_type =
+      gf_group->update_type[gf_group->index];
+  const int which_arf = (cpi->new_bwdref_update_rule == 1)
+                            ? gf_group->arf_update_idx[gf_group->index] > 0
+                            : gf_group->arf_update_idx[gf_group->index];
+
+  if (cm->show_existing_frame == 1) {
+    cm->show_existing_frame = 0;
+  } else if (cpi->rc.is_last_bipred_frame) {
+    // NOTE: When new structure is used, every bwdref will have one overlay
+    //       frame. Therefore, there is no need to find out which frame to
+    //       show in advance.
+    if (cpi->new_bwdref_update_rule == 0) {
+      // NOTE: If the current frame is a last bi-predictive frame, it is
+      //       needed next to show the BWDREF_FRAME, which is pointed by
+      //       the last_fb_idxes[0] after reference frame buffer update
+      cpi->rc.is_last_bipred_frame = 0;
+      cm->show_existing_frame = 1;
+      cpi->existing_fb_idx_to_show = cm->remapped_ref_idx[0];
+    }
+  } else if (cpi->is_arf_filter_off[which_arf] &&
+             (next_frame_update_type == OVERLAY_UPDATE ||
+              next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+    const int bwdref_to_show =
+        (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+    // Other parameters related to OVERLAY_UPDATE will be taken care of
+    // in av1_rc_get_second_pass_params(cpi)
+    cm->show_existing_frame = 1;
+    cpi->rc.is_src_frame_alt_ref = 1;
+    cpi->existing_fb_idx_to_show =
+        (next_frame_update_type == OVERLAY_UPDATE)
+            ? get_ref_frame_map_idx(cm, ALTREF_FRAME)
+            : get_ref_frame_map_idx(cm, bwdref_to_show);
+    if (cpi->new_bwdref_update_rule == 0) {
+      cpi->is_arf_filter_off[which_arf] = 0;
+    }
+  }
+  cpi->rc.is_src_frame_ext_arf = 0;
+}

 static void set_ext_overrides(AV1_COMP *const cpi,
                              EncodeFrameParams *const frame_params) {
@@ -28,9 +260,8 @@ static void set_ext_overrides(AV1_COMP *const cpi,
  AV1_COMMON *const cm = &cpi->common;

  if (cpi->ext_use_s_frame) {
-    cm->current_frame.frame_type = S_FRAME;
+    frame_params->frame_type = S_FRAME;
  }
-  cm->force_primary_ref_none = cpi->ext_use_primary_ref_none;

  if (cpi->ext_refresh_frame_context_pending) {
    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
@@ -50,10 +281,9 @@ static void set_ext_overrides(AV1_COMP *const cpi,
  // A keyframe is already error resilient and keyframes with
  // error_resilient_mode interferes with the use of show_existing_frame
  // when forward reference keyframes are enabled.
-  frame_params->error_resilient_mode &=
-      cm->current_frame.frame_type != KEY_FRAME;
+  frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME;
  // For bitstream conformance, s-frames must be error-resilient
-  frame_params->error_resilient_mode |= frame_is_sframe(cm);
+  frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
 }

 static int get_ref_frame_flags(const AV1_COMP *const cpi) {
@@ -130,10 +360,474 @@ static int get_ref_frame_flags(const AV1_COMP *const cpi) {
  return flags;
 }

+static int get_current_frame_ref_type(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  // We choose the reference "type" of this frame from the flags which indicate
+  // which reference frames will be refreshed by it.  More than one of these
+  // flags may be set, so the order here implies an order of precedence.
+  // This is just used to choose the primary_ref_frame (as the most recent
+  // reference buffer of the same reference-type as the current frame)
+
+  const int intra_only = frame_params->frame_type == KEY_FRAME ||
+                         frame_params->frame_type == INTRA_ONLY_FRAME;
+  if (intra_only || frame_params->error_resilient_mode ||
+      cpi->ext_use_primary_ref_none)
+    return REGULAR_FRAME;
+  else if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
+    return EXT_ARF_FRAME;
+  else if (cpi->refresh_alt_ref_frame)
+    return ARF_FRAME;
+  else if (cpi->rc.is_src_frame_alt_ref)
+    return OVERLAY_FRAME;
+  else if (cpi->refresh_golden_frame)
+    return GLD_FRAME;
+  else if (cpi->refresh_bwd_ref_frame)
+    return BRF_FRAME;
+  else
+    return REGULAR_FRAME;
+}
+
+static int choose_primary_ref_frame(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int intra_only = frame_params->frame_type == KEY_FRAME ||
+                         frame_params->frame_type == INTRA_ONLY_FRAME;
+  if (intra_only || frame_params->error_resilient_mode ||
+      cpi->ext_use_primary_ref_none) {
+    return PRIMARY_REF_NONE;
+  }
+
+  // Find the most recent reference frame with the same reference type as the
+  // current frame
+  const FRAME_CONTEXT_INDEX current_ref_type =
+      get_current_frame_ref_type(cpi, frame_params);
+  int wanted_fb = cpi->fb_of_context_type[current_ref_type];
+
+  int primary_ref_frame = PRIMARY_REF_NONE;
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+      primary_ref_frame = ref_frame - LAST_FRAME;
+    }
+  }
+  return primary_ref_frame;
+}
+
+static void update_fb_of_context_type(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+      cpi->ext_use_primary_ref_none) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      fb_of_context_type[i] = -1;
+    }
+    fb_of_context_type[REGULAR_FRAME] =
+        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+  }
+
+  if (!encode_show_existing_frame(cm)) {
+    // Refresh fb_of_context_type[]: see encoder.h for explanation
+    // Note that we want the value of refresh_frame_flags for the frame that
+    // just happened.  If we call get_refresh_frame_flags now we will get a
+    // different answer, because update_reference_frames() has happened.
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      // All ref frames are refreshed, pick one that will live long enough
+      fb_of_context_type[REGULAR_FRAME] = 0;
+    } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick so pick the first.  LST sometimes doesn't refresh any: this is ok
+      const int current_frame_ref_type =
+          get_current_frame_ref_type(cpi, frame_params);
+      for (int i = 0; i < REF_FRAMES; i++) {
+        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+          fb_of_context_type[current_frame_ref_type] = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static int get_order_offset(const AV1_COMP *const cpi,
+                            const EncodeFrameParams *const frame_params) {
+  // shown frame by definition has order offset 0
+  // show_existing_frame ignores order_offset and simply takes the order_hint
+  // from the reference frame being shown.
+  if (frame_params->show_frame || cpi->common.show_existing_frame) return 0;
+
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  const int arf_offset =
+      AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]);
+  const int brf_offset = gf_group->brf_src_offset[gf_group->index];
+  return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi,
+                              const struct lookahead_entry *source) {
+  int64_t this_duration;
+  int step = 0;
+
+  // Clear down mmx registers
+  aom_clear_system_state();
+
+  if (source->ts_start == cpi->first_time_stamp_ever) {
+    this_duration = source->ts_end - source->ts_start;
+    step = 1;
+  } else {
+    int64_t last_duration =
+        cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // do a step update if the duration changes by 10%
+    if (last_duration)
+      step = (int)((this_duration - last_duration) * 10 / last_duration);
+  }
+
+  if (this_duration) {
+    if (step) {
+      av1_new_framerate(cpi, 10000000.0 / this_duration);
+    } else {
+      // Average this frame's rate into the last second's average
+      // frame rate. If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval = AOMMIN(
+          (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+    }
+  }
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+static void check_src_altref(AV1_COMP *cpi,
+                             const struct lookahead_entry *source) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // If pass == 2, the parameters set here will be reset in
+  // av1_rc_get_second_pass_params()
+
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    rc->is_src_frame_alt_ref =
+        (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
+        (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+    rc->is_src_frame_ext_arf =
+        gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
+  } else {
+    rc->is_src_frame_alt_ref =
+        cpi->alt_ref_source && (source == cpi->alt_ref_source);
+  }
+
+  if (rc->is_src_frame_alt_ref) {
+    // Current frame is an ARF overlay frame.
+    cpi->alt_ref_source = NULL;
+
+    if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) {
+      // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to
+      // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3,
+      // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST.
+      cpi->refresh_last_frame = 1;
+    } else {
+      // Don't refresh the last buffer for an ARF overlay frame. It will
+      // become the GF so preserve last as an alternative prediction option.
+      cpi->refresh_last_frame = 0;
+    }
+  }
+}
+
+// Returns 0 if this is not an alt ref else the offset of the source frame
+// used as the arf midpoint.
+static int get_arf_src_index(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int arf_src_index = 0;
+  if (is_altref_enabled(cpi)) {
+    if (cpi->oxcf.pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+        arf_src_index = gf_group->arf_src_offset[gf_group->index];
+      }
+    } else if (rc->source_alt_ref_pending) {
+      arf_src_index = rc->frames_till_gf_update_due;
+    }
+  }
+  return arf_src_index;
+}
+
+static int get_brf_src_index(AV1_COMP *cpi) {
+  int brf_src_index = 0;
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
+  //               flag.
+  if (gf_group->bidir_pred_enabled[gf_group->index]) {
+    if (cpi->oxcf.pass == 2) {
+      if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
+        brf_src_index = gf_group->brf_src_offset[gf_group->index];
+    } else {
+      // TODO(zoeliu): To re-visit the setup for this scenario
+      brf_src_index = cpi->rc.bipred_group_interval - 1;
+    }
+  }
+
+  return brf_src_index;
+}
+
+// Returns 0 if this is not an alt ref else the offset of the source frame
+// used as the arf midpoint.
+static int get_arf2_src_index(AV1_COMP *cpi) {
+  int arf2_src_index = 0;
+  if (is_altref_enabled(cpi) && cpi->num_extra_arfs) {
+    if (cpi->oxcf.pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+        arf2_src_index = gf_group->arf_src_offset[gf_group->index];
+      }
+    }
+  }
+  return arf2_src_index;
+}
+
+// Called if this frame is an ARF or ARF2. Also handles forward-keyframes
+// For an ARF set arf2=0, for ARF2 set arf2=1
+// temporal_filtered is set to 1 if we temporally filter the ARF frame, so that
+// the correct post-filter buffer can be used.
+static struct lookahead_entry *setup_arf_or_arf2(
+    AV1_COMP *const cpi, const int arf_src_index, const int arf2,
+    int *temporal_filtered, EncodeFrameParams *const frame_params) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  assert(arf_src_index <= rc->frames_to_key);
+  *temporal_filtered = 0;
+
+  struct lookahead_entry *source =
+      av1_lookahead_peek(cpi->lookahead, arf_src_index);
+
+  if (source != NULL) {
+    cm->showable_frame = 1;
+    cpi->alt_ref_source = source;
+
+    // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+    if (!arf2 && arf_src_index == rc->frames_to_key) {
+      // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      int which_arf = gf_group->arf_update_idx[gf_group->index];
+      cpi->is_arf_filter_off[which_arf] = 1;
+      cpi->no_show_kf = 1;
+    } else {
+      if (oxcf->arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        av1_temporal_filter(cpi, arf_src_index);
+        aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+        *temporal_filtered = 1;
+      }
+    }
+    frame_params->show_frame = 0;
+
+    if (oxcf->pass < 2) {
+      // In second pass, the buffer updates configure will be set
+      // in the function av1_rc_get_second_pass_params
+      av1_configure_buffer_updates(cpi, arf2 ? INTNL_ARF_UPDATE : ARF_UPDATE);
+    }
+  }
+  rc->source_alt_ref_pending = 0;
+  return source;
+}
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+static int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                                      const int up_to_index) {
+  for (int i = 0; i <= up_to_index; i++) {
+    const struct lookahead_entry *e = av1_lookahead_peek(lookahead, i);
+    if (e == NULL) {
+      // We have reached the end of the lookahead buffer and not early-returned
+      // so there isn't a forced key-frame pending.
+      return 0;
+    } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+      return 1;
+    } else {
+      continue;
+    }
+  }
+  return 0;  // Never reached
+}
+
+// Check if we should encode an ARF, ARF2 or BRF.  If not, try a LAST
+// Do some setup associated with the chosen source
+// Return the frame source, or NULL if we couldn't find one
+struct lookahead_entry *choose_frame_source(
+    AV1_COMP *const cpi, int *const temporal_filtered, int *const flush,
+    struct lookahead_entry **last_source,
+    EncodeFrameParams *const frame_params) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct lookahead_entry *source = NULL;
+  *temporal_filtered = 0;
+
+  // Should we encode an alt-ref frame.
+  int arf_src_index = get_arf_src_index(cpi);
+  if (arf_src_index &&
+      is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+    arf_src_index = 0;
+    *flush = 1;
+  }
+
+  if (arf_src_index) {
+    source = setup_arf_or_arf2(cpi, arf_src_index, 0, temporal_filtered,
+                               frame_params);
+  }
+
+  // Should we encode an arf2 frame (mutually exclusive to ARF)
+  arf_src_index = get_arf2_src_index(cpi);
+  if (arf_src_index &&
+      is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+    arf_src_index = 0;
+    *flush = 1;
+  }
+
+  if (arf_src_index) {
+    source = setup_arf_or_arf2(cpi, arf_src_index, 1, temporal_filtered,
+                               frame_params);
+  }
+
+  cpi->rc.is_bwd_ref_frame = 0;
+  int brf_src_index = get_brf_src_index(cpi);
+  if (brf_src_index) {
+    assert(brf_src_index <= cpi->rc.frames_to_key);
+    if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+      cm->showable_frame = 1;
+      frame_params->show_frame = 0;
+
+      if (cpi->oxcf.pass < 2) {
+        // In second pass, the buffer updates configure will be set
+        // in the function av1_rc_get_second_pass_params
+        av1_configure_buffer_updates(cpi, BRF_UPDATE);
+      }
+    }
+  }
+
+  if (!source) {
+    // Get last frame source.
+    if (cm->current_frame.frame_number > 0) {
+      *last_source = av1_lookahead_peek(cpi->lookahead, -1);
+    }
+    // Read in the source frame.
+    source = av1_lookahead_pop(cpi->lookahead, *flush);
+
+    if (source != NULL) {
+      frame_params->show_frame = 1;
+
+      // Check to see if the frame should be encoded as an arf overlay.
+      check_src_altref(cpi, source);
+    }
+  }
+  return source;
+}
+
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
-                        uint8_t *const dest, unsigned int *frame_flags) {
-  EncodeFrameParams frame_params = { 0, 0, 0 };
-  EncodeFrameResults frame_results = { 0 };
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational_t *const timebase, int flush) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1_COMMON *const cm = &cpi->common;
+
+  EncodeFrameInput frame_input;
+  EncodeFrameParams frame_params;
+  EncodeFrameResults frame_results;
+  memset(&frame_input, 0, sizeof(frame_input));
+  memset(&frame_params, 0, sizeof(frame_params));
+  memset(&frame_results, 0, sizeof(frame_results));
+
+  int temporal_filtered = 0;
+  struct lookahead_entry *source = NULL;
+  struct lookahead_entry *last_source = NULL;
+  if (cm->show_existing_frame) {
+    source = av1_lookahead_pop(cpi->lookahead, flush);
+  } else {
+    source = choose_frame_source(cpi, &temporal_filtered, &flush, &last_source,
+                                 &frame_params);
+  }
+
+  if (source == NULL) {  // If no source was found, we can't encode a frame.
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+      av1_end_first_pass(cpi); /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+    return -1;
+  }
+
+  frame_input.source = temporal_filtered ? &cpi->alt_ref_buffer : &source->img;
+  frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+  frame_input.ts_duration = source->ts_end - source->ts_start;
+
+  *time_stamp = source->ts_start;
+  *time_end = source->ts_end;
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
+  }
+
+  av1_apply_encoding_flags(cpi, source->flags);
+  if (!cm->show_existing_frame)
+    *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+  cpi->frame_flags = *frame_flags;
+
+  if (frame_params.show_frame ||
+      (cm->show_existing_frame && cpi->rc.is_src_frame_alt_ref)) {
+    // Shown frames and arf-overlay frames need frame-rate considering
+    adjust_frame_rate(cpi, source);
+  }
+
+  if (cm->show_existing_frame) {
+    // show_existing_frame implies this frame is shown!
+    frame_params.show_frame = 1;
+  } else {
+    // Retain the RF_LEVEL for the current newly coded frame.
+    cm->cur_frame->frame_rf_level =
+        cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+
+    if (cpi->film_grain_table) {
+      cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
+          cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+          &cm->film_grain_params);
+    }
+    cm->cur_frame->film_grain_params_present =
+        cm->seq_params.film_grain_params_present;
+
+    // only one operating point supported now
+    const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+    if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+    cpi->common.frame_presentation_time = (uint32_t)pts64;
+  }
+
+  if (oxcf->pass == 2 &&
+      (!cm->show_existing_frame || cpi->rc.is_src_frame_alt_ref)) {
+    // GF_GROUP needs updating for arf overlays as well as non-show-existing
+    av1_rc_get_second_pass_params(cpi, &frame_params);
+  }
+  if (cm->show_existing_frame && frame_params.frame_type != KEY_FRAME) {
+    // Force show-existing frames to be INTER, except forward keyframes
+    frame_params.frame_type = INTER_FRAME;
+  }
+
+  if (!cm->show_existing_frame) {
+    cm->using_qmatrix = cpi->oxcf.using_qm;
+    cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+    cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+    if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
+      av1_set_frame_size(cpi, cm->width, cm->height);
+      av1_tpl_setup_stats(cpi, &frame_input);
+    }
+  }

  frame_params.frame_flags = frame_flags;

@@ -143,17 +837,82 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
  // TODO(david.turner@argondesign.com): Change all the encode strategy to
  // modify frame_params instead of cm or cpi.

-  // Apply external override flags
-  set_ext_overrides(cpi, &frame_params);
+  // Per-frame encode speed.  In theory this can vary, but things may have been
+  // written assuming speed-level will not change within a sequence, so this
+  // parameter should be used with caution.
+  frame_params.speed = oxcf->speed;

-  // Work out which reference frame slots may be used.
-  frame_params.ref_frame_flags = get_ref_frame_flags(cpi);
+  // Work out some encoding parameters specific to the pass:
+  if (oxcf->pass == 0) {
+    if (cpi->oxcf.rc_mode == AOM_CBR) {
+      av1_rc_get_one_pass_cbr_params(cpi, &frame_params);
+    } else {
+      av1_rc_get_one_pass_vbr_params(cpi, &frame_params);
+    }
+  } else if (oxcf->pass == 1) {
+    av1_setup_frame_size(cpi);
+    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf);
+    if (!cpi->refresh_alt_ref_frame && (cm->current_frame.frame_number == 0 ||
+                                        (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+      frame_params.frame_type = KEY_FRAME;
+    } else {
+      frame_params.frame_type = INTER_FRAME;
+    }
+  } else if (oxcf->pass == 2) {
+#if CONFIG_MISMATCH_DEBUG
+    mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+    cm->txcoeff_cost_timer = 0;
+    cm->txcoeff_cost_count = 0;
+#endif
+  }

-  if (av1_encode(cpi, dest, &frame_params, &frame_results) != AOM_CODEC_OK) {
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    // Apply external override flags
+    set_ext_overrides(cpi, &frame_params);
+
+    // Work out which reference frame slots may be used.
+    frame_params.ref_frame_flags = get_ref_frame_flags(cpi);
+  }
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    frame_params.primary_ref_frame =
+        choose_primary_ref_frame(cpi, &frame_params);
+    frame_params.order_offset = get_order_offset(cpi, &frame_params);
+  }
+
+  if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+      AOM_CODEC_OK) {
    return AOM_CODEC_ERROR;
  }

+  if (oxcf->pass == 2) {
+#if TXCOEFF_COST_TIMER
+    cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+    fprintf(stderr,
+            "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+            "in us\n",
+            cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+            cm->cum_txcoeff_cost_timer);
+#endif
+    av1_twopass_postencode_update(cpi);
+  }
+
+  if (oxcf->pass == 0 || oxcf->pass == 2) {
+    update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
+    set_additional_frame_flags(cm, frame_params.frame_flags);
+    update_rc_counts(cpi);
+    check_show_existing_frame(cpi);  // Is next frame a show_existing frame?
+  }
+
+  // Unpack frame_results:
  *size = frame_results.size;

+  // Leave a signal for a higher level caller about if this frame is droppable
+  if (*size > 0) {
+    cpi->droppable = is_frame_droppable(cpi);
+  }
+
  return AOM_CODEC_OK;
 }
@@ -16,11 +16,24 @@
 extern "C" {
 #endif

+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
 // This function will implement high-level encode strategy, choosing frame type,
 // frame placement, etc.  It populates an EncodeFrameParams struct with the
 // results of these decisions and then calls av1_encode()
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
-                        uint8_t *const dest, unsigned int *frame_flags);
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational_t *const timebase, int flush);
+
+// Set individual buffer update flags based on frame reference type
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  const FRAME_UPDATE_TYPE type);

 #ifdef __cplusplus
 }  // extern "C"
@@ -600,7 +600,7 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
    return;
  }

-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    x->source_variance = av1_high_get_sby_perpixel_variance(
        cpi, &x->plane[0].src, bsize, xd->bd);
  } else {
@@ -613,8 +613,7 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
    x->edge_strength = UINT16_MAX;
  } else {
    x->edge_strength =
-        edge_strength(&x->plane[0].src, bsize,
-                      xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd);
+        edge_strength(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd);
  }
  // Save rdmult before it might be changed, so it can be restored later.
  orig_rdmult = x->rdmult;
@@ -2180,7 +2179,8 @@ static void simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,

  // Get a copy of the prediction output
  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);

  aom_clear_system_state();

@@ -2787,77 +2787,6 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
  }
 }

-#define FEATURE_SIZE 19
-static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
-  2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
-  0.125296f, -1.134961f, 0.862757f,  -0.418799f, -0.637666f,
-  0.016232f, 0.345013f,  0.018823f,  -0.393394f, -1.130700f,
-  0.695357f, 0.112569f,  -0.341975f, -0.513882f, 5.7488966f,
-};
-
-static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
-  2.990993f,  0.423273f,  -0.926544f, 0.454646f,  -0.292698f,
-  -1.311632f, -0.284432f, 0.717141f,  -0.419257f, -0.574760f,
-  -0.674444f, 0.669047f,  -0.374255f, 0.380624f,  -0.804036f,
-  0.264021f,  0.004163f,  1.896802f,  0.924287f,  0.13490619f,
-};
-
-static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
-  2.795181f,  -0.136943f, -0.924842f, 0.405330f,  -0.463505f,
-  -0.584076f, -0.831472f, 0.382985f,  -0.597544f, -0.138915f,
-  -1.354350f, 0.466035f,  -0.553961f, 0.213202f,  -1.166429f,
-  0.010776f,  -0.096236f, 2.335084f,  1.699857f,  -0.58178353f,
-};
-
-static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
-  1.987888f,  -0.431100f, -1.687703f, 0.262602f,  -0.425298f,
-  -0.463870f, -1.493457f, 0.470917f,  -0.528457f, -0.087700f,
-  -1.815092f, 0.152883f,  -0.337908f, 0.093679f,  -1.548267f,
-  -0.042387f, -0.000861f, 2.556746f,  1.619192f,  0.03643292f,
-};
-
-static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
-  2.188344f,  -0.817528f, -2.119219f, 0.000000f,  -0.348167f,
-  -0.658074f, -1.960362f, 0.000000f,  -0.403080f, 0.282699f,
-  -2.061088f, 0.000000f,  -0.431919f, -0.127960f, -1.099550f,
-  0.000000f,  0.121622f,  2.017455f,  2.058228f,  -0.15475988f,
-};
-
-static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
-  -1.006689f, 0.777908f,  4.461072f,  -0.395782f, -0.014610f,
-  -0.853863f, 0.729997f,  -0.420477f, 0.282429f,  -1.194595f,
-  3.181220f,  -0.511416f, 0.117084f,  -1.149348f, 1.507990f,
-  -0.477212f, 0.202963f,  -1.469581f, 0.624461f,  -0.89081228f,
-};
-
-static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
-  -1.241117f, 0.844878f,  5.638803f,  -0.489780f, -0.108796f,
-  -4.576821f, 1.540624f,  -0.477519f, 0.227791f,  -1.443968f,
-  1.586911f,  -0.505125f, 0.140764f,  -0.464194f, 1.466658f,
-  -0.641166f, 0.195412f,  1.427905f,  2.080007f,  -1.98272777f,
-};
-
-static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
-  -2.130825f, 0.476023f,  5.907343f,  -0.516002f, -0.097471f,
-  -2.662754f, 0.614858f,  -0.576728f, 0.085261f,  -0.031901f,
-  0.727842f,  -0.600034f, 0.079326f,  0.324328f,  0.504502f,
-  -0.547105f, -0.037670f, 0.304995f,  0.369018f,  -2.66299987f,
-};
-
-static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
-  -1.626410f, 0.872047f,  5.414965f,  -0.554781f, -0.084514f,
-  -3.020550f, 0.467632f,  -0.382280f, 0.199568f,  0.426220f,
-  0.829426f,  -0.467100f, 0.153098f,  0.662994f,  0.327545f,
-  -0.560106f, -0.141610f, 0.403372f,  0.523991f,  -3.02891231f,
-};
-
-static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
-  -1.463349f, 0.375376f,  4.751430f, 0.000000f, -0.184451f,
-  -1.655447f, 0.443214f,  0.000000f, 0.127961f, 0.152435f,
-  0.083288f,  0.000000f,  0.143105f, 0.438012f, 0.073238f,
-  0.000000f,  -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
-};
-
 // split_score indicates confidence of picking split partition;
 // none_score indicates confidence of picking none partition;
 static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
@@ -2980,7 +2909,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi,
  // Variance ratios
  const MACROBLOCKD *const xd = &x->e_mbd;
  int whole_block_variance;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    whole_block_variance = av1_high_get_sby_perpixel_variance(
        cpi, &x->plane[0].src, bsize, xd->bd);
  } else {
@@ -2998,7 +2927,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi,
    const int x_idx = (i & 1) * bw / 2;
    const int y_idx = (i >> 1) * bw / 2;
    buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
      split_variance[i] =
          av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
    } else {
@@ -3180,7 +3109,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
          src + i * block_size_high[horz_4_bs] * src_stride;
      const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
      unsigned int horz_var, vert_var, sse;
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (is_cur_buf_hbd(xd)) {
        switch (xd->bd) {
          case 10:
            horz_var = cpi->fn_ptr[horz_4_bs].vf(
@@ -3898,6 +3827,13 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,

  (void)*tp_orig;

+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionStats *part_stats = &cpi->partition_stats;
+  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+  int *partition_decisions = part_stats->partition_decisions[bsize_idx];
+  int *partition_attempts = part_stats->partition_attempts[bsize_idx];
+#endif
+
  // Override partition costs at the edges of the frame in the same
  // way as in read_partition (see decodeframe.c)
  if (!(has_rows && has_cols)) {
@@ -4154,6 +4090,11 @@ BEGIN_PARTITION_SEARCH:
    const int64_t best_remain_rdcost =
        (best_rdc.rdcost == INT64_MAX) ? INT64_MAX
                                       : (best_rdc.rdcost - partition_rd_cost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm) && best_remain_rdcost >= 0) {
+      partition_attempts[PARTITION_NONE] += 1;
+    }
+#endif
    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
                     PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
    pb_source_variance = x->source_variance;
@@ -4291,6 +4232,11 @@ BEGIN_PARTITION_SEARCH:
    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);

    int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm) && best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_SPLIT] += 1;
+    }
+#endif
    for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
      const int x_idx = (idx & 1) * mi_step;
      const int y_idx = (idx >> 1) * mi_step;
@@ -4469,11 +4415,16 @@ BEGIN_PARTITION_SEARCH:
      pc_tree->horizontal[0].pred_interp_filter =
          av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
    }
+    sum_rdc.rate = partition_cost[PARTITION_HORZ];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
    const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
                                           ? INT64_MAX
                                           : (best_rdc.rdcost - sum_rdc.rdcost);
-    sum_rdc.rate = partition_cost[PARTITION_HORZ];
-    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm) && best_remain_rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ] += 1;
+    }
+#endif
    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
                     PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
                     best_remain_rdcost);
@@ -4551,6 +4502,11 @@ BEGIN_PARTITION_SEARCH:
    const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
                                           ? INT64_MAX
                                           : (best_rdc.rdcost - sum_rdc.rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm) && best_remain_rdcost >= 0) {
+      partition_attempts[PARTITION_VERT] += 1;
+    }
+#endif
    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
                     PARTITION_VERT, subsize, &pc_tree->vertical[0],
                     best_remain_rdcost);
@@ -4609,7 +4565,7 @@ BEGIN_PARTITION_SEARCH:

  if (pb_source_variance == UINT_MAX) {
    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
      pb_source_variance = av1_high_get_sby_perpixel_variance(
          cpi, &x->plane[0].src, bsize, xd->bd);
    } else {
@@ -4770,6 +4726,18 @@ BEGIN_PARTITION_SEARCH:
        pc_tree->horizontala[2].ref_selected[0] = split_mbmi[2]->ref_frame[0];
      }
    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (!frame_is_intra_only(cm) &&
+          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_HORZ_A] += 1;
+      }
+    }
+#endif
    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                       pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
                       PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
@@ -4829,6 +4797,18 @@ BEGIN_PARTITION_SEARCH:
        pc_tree->horizontalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0];
      }
    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (!frame_is_intra_only(cm) &&
+          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_HORZ_B] += 1;
+      }
+    }
+#endif
    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                       pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
                       PARTITION_HORZ_B, mi_row, mi_col, subsize,
@@ -4886,6 +4866,18 @@ BEGIN_PARTITION_SEARCH:
        pc_tree->verticala[2].ref_selected[0] = split_mbmi[1]->ref_frame[0];
      }
    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (!frame_is_intra_only(cm) &&
+          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_VERT_A] += 1;
+      }
+    }
+#endif
    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                       pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
                       PARTITION_VERT_A, mi_row, mi_col, bsize2,
@@ -4942,6 +4934,18 @@ BEGIN_PARTITION_SEARCH:
        pc_tree->verticalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0];
      }
    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    {
+      RD_STATS tmp_sum_rdc;
+      av1_init_rd_stats(&tmp_sum_rdc);
+      tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B];
+      tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+      if (!frame_is_intra_only(cm) &&
+          best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+        partition_attempts[PARTITION_VERT_B] += 1;
+      }
+    }
+#endif
    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                       pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
                       PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
@@ -5000,6 +5004,11 @@ BEGIN_PARTITION_SEARCH:
    sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);

+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm) && best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ_4] += 1;
+    }
+#endif
    for (int i = 0; i < 4; ++i) {
      const int this_mi_row = mi_row + i * quarter_step;

@@ -5046,6 +5055,11 @@ BEGIN_PARTITION_SEARCH:
    sum_rdc.rate = partition_cost[PARTITION_VERT_4];
    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);

+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm) && best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_VERT_4] += 1;
+    }
+#endif
    for (int i = 0; i < 4; ++i) {
      const int this_mi_col = mi_col + i * quarter_step;

@@ -5083,6 +5097,11 @@ BEGIN_PARTITION_SEARCH:
    // Did not find a valid partition, go back and search again, with less
    // constraint on which partition types to search.
    x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (!frame_is_intra_only(cm)) {
+      part_stats->partition_redo += 1;
+    }
+#endif
    goto BEGIN_PARTITION_SEARCH;
  }

@@ -5093,6 +5112,13 @@ BEGIN_PARTITION_SEARCH:
  (void)best_rd;
  *rd_cost = best_rdc;

+#if CONFIG_COLLECT_PARTITION_STATS
+  if (!frame_is_intra_only(cm) && best_rdc.rate < INT_MAX &&
+      best_rdc.dist < INT64_MAX) {
+    partition_decisions[pc_tree->partitioning] += 1;
+  }
+#endif
+
  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
      pc_tree->index != 3) {
    if (bsize == cm->seq_params.sb_size) {
@@ -5643,13 +5669,11 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
                        sb_size, BLOCK_4X4, &dummy_rdc, INT64_MAX, pc_root,
                        NULL);
    }
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
    // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
    if (cpi->sf.inter_mode_rd_model_estimation == 1 && cm->tile_cols == 1 &&
        cm->tile_rows == 1) {
      av1_inter_mode_data_fit(tile_data, x->rdmult);
    }
-#endif
    if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
        (tile_info->mi_row_end > (mi_row + mib_size))) {
      if (sb_cols_in_tile == 1)
@@ -5805,9 +5829,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
  const TileInfo *const tile_info = &this_tile->tile_info;
  int mi_row;

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
  av1_inter_mode_data_init(this_tile);
-#endif

  av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
                         tile_info->mi_col_end, tile_row);
@@ -6350,11 +6372,10 @@ static void encode_frame_internal(AV1_COMP *cpi) {
                 do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
                 !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
        TransformationType model;
-        const int64_t ref_frame_error =
-            av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-                            ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
-                            cpi->source->y_buffer, cpi->source->y_width,
-                            cpi->source->y_height, cpi->source->y_stride);
+        const int64_t ref_frame_error = av1_frame_error(
+            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+            ref_buf[frame]->y_stride, cpi->source->y_buffer,
+            cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride);

        if (ref_frame_error == 0) continue;

@@ -6380,9 +6401,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {

            if (tmp_wm_params.wmtype != IDENTITY) {
              const int64_t warp_error = av1_refine_integerized_param(
-                  &tmp_wm_params, tmp_wm_params.wmtype,
-                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-                  ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+                  &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd),
+                  xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
                  ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
                  cpi->source->y_buffer, cpi->source->y_width,
                  cpi->source->y_height, cpi->source->y_stride, 5,
@@ -6491,20 +6511,6 @@ void av1_encode_frame(AV1_COMP *cpi) {
  // rather than the potential full set of 16 transforms
  cm->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;

-  if (cm->show_frame == 0) {
-    int arf_offset = AOMMIN(
-        (MAX_GF_INTERVAL - 1),
-        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
-    int brf_offset =
-        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
-    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
-    current_frame->order_hint = current_frame->frame_number + arf_offset;
-  } else {
-    current_frame->order_hint = current_frame->frame_number;
-  }
-  current_frame->order_hint %=
-      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
-
  // Make sure segment_id is no larger than last_active_segid.
  if (cm->seg.enabled && cm->seg.update_map) {
    const int mi_rows = cm->mi_rows;
@@ -6520,7 +6526,9 @@ void av1_encode_frame(AV1_COMP *cpi) {
  }

  av1_setup_frame_buf_refs(cm);
-  if (cpi->sf.selective_ref_frame >= 3) enforce_max_ref_frames(cpi);
+  if (cpi->sf.selective_ref_frame >= 3 && cpi->oxcf.max_reference_frames == 7) {
+    enforce_max_ref_frames(cpi);
+  }
  av1_setup_frame_sign_bias(cm);

 #if CONFIG_MISMATCH_DEBUG
@@ -6830,7 +6838,8 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                           xd->block_ref_scale_factors[ref], num_planes);
    }

-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
    if (mbmi->motion_mode == OBMC_CAUSAL) {
      assert(cpi->oxcf.enable_obmc == 1);
      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
@@ -43,7 +43,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
                           const uint8_t *src8, ptrdiff_t src_stride,
                           const uint8_t *pred8, ptrdiff_t pred_stride) {
  if (check_subtract_block_size(rows, cols)) {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
      aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
                                  src_stride, pred8, pred_stride, xd->bd);
      return;
@@ -54,7 +54,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
    return;
  }

-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                              pred8, pred_stride, xd->bd);
    return;
@@ -163,6 +163,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
  qparam.tx_size = tx_size;
  qparam.qmatrix = qmatrix;
  qparam.iqmatrix = iqmatrix;
+  qparam.use_quant_b_adapt = cm->use_quant_b_adapt;
  TxfmParam txfm_param;
  txfm_param.tx_type = tx_type;
  txfm_param.tx_size = tx_size;
@@ -171,7 +172,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
      txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);

  txfm_param.bd = xd->bd;
-  txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param.is_hbd = is_cur_buf_hbd(xd);

  av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);

@@ -431,7 +432,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,

  if (p->eobs[block] > 0) {
    txfm_param.bd = xd->bd;
-    txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+    txfm_param.is_hbd = is_cur_buf_hbd(xd);
    txfm_param.tx_type = DCT_DCT;
    txfm_param.tx_size = tx_size;
    txfm_param.eob = p->eobs[block];
@@ -329,6 +329,7 @@ typedef struct AV1EncoderConfig {
  int enable_order_hint;
  int enable_dist_wtd_comp;
  int enable_ref_frame_mvs;
+  unsigned int max_reference_frames;
  unsigned int allow_ref_frame_mvs;
  int enable_masked_comp;
  int enable_interintra_comp;
@@ -357,6 +358,9 @@ typedef struct AV1EncoderConfig {
  unsigned int chroma_subsampling_x;
  unsigned int chroma_subsampling_y;
  int reduced_tx_type_set;
+  int use_intra_dct_only;
+  int use_inter_dct_only;
+  int quant_b_adapt;
  int border_in_pixels;
 } AV1EncoderConfig;

@@ -450,7 +454,6 @@ typedef struct FRAME_COUNTS {
                                [SWITCHABLE_FILTERS];
 } FRAME_COUNTS;

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 #define INTER_MODE_RD_DATA_OVERALL_SIZE 6400

 typedef struct {
@@ -485,7 +488,6 @@ typedef struct inter_modes_info {
  int64_t est_rd_arr[MAX_INTER_MODES];
  RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
 } InterModesInfo;
-#endif

 // Encoder row synchronization
 typedef struct AV1RowMTSyncData {
@@ -514,9 +516,7 @@ typedef struct TileDataEnc {
  DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
  FRAME_CONTEXT *row_ctx;
  uint8_t allow_update_cdf;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
  InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-#endif
  AV1RowMTSync row_mt_sync;
  AV1RowMTInfo row_mt_info;
 } TileDataEnc;
@@ -551,9 +551,7 @@ typedef struct ThreadData {
  tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
  tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
  tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
  InterModesInfo *inter_modes_info;
-#endif
  uint32_t *hash_value_buffer[2][2];
  int32_t *wsrc_buf;
  int32_t *mask_buf;
@@ -595,6 +593,15 @@ typedef struct {
  YV12_BUFFER_CONFIG buf;
 } EncRefCntBuffer;

+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct PartitionStats {
+  int partition_decisions[6][EXT_PARTITION_TYPES];
+  int partition_attempts[6][EXT_PARTITION_TYPES];
+
+  int partition_redo;
+} PartitionStats;
+#endif
+
 typedef struct AV1_COMP {
  QUANTS quants;
  ThreadData td;
@@ -658,14 +665,12 @@ typedef struct AV1_COMP {
  // frame of the same type as the current frame).
  int fb_of_context_type[REF_FRAMES];

-#if USE_SYMM_MULTI_LAYER
  // When true, a new rule for backward (future) reference frames is in effect:
  // - BWDREF_FRAME is always the closest future frame available
  // - ALTREF2_FRAME is always the 2nd closest future frame available
  // - 'refresh_bwd_ref_frame' flag is used for updating both the BWDREF_FRAME
  // and ALTREF2_FRAME. ('refresh_alt2_ref_frame' flag is irrelevant).
  int new_bwdref_update_rule;
-#endif

  int ext_refresh_frame_flags_pending;
  int ext_refresh_last_frame;
@@ -718,6 +723,9 @@ typedef struct AV1_COMP {
  int ref_frame_flags;
  int ext_ref_frame_flags;

+  // speed is passed as a per-frame parameter into the encoder
+  int speed;
+  // sf contains fine-grained config set internally based on speed
  SPEED_FEATURES sf;

  unsigned int max_mv_magnitude;
@@ -865,18 +873,35 @@ typedef struct AV1_COMP {
 #endif
  // Set if screen content is set or relevant tools are enabled
  int is_screen_content_type;
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionStats partition_stats;
+#endif
 } AV1_COMP;

+typedef struct {
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;
+  int64_t ts_duration;
+} EncodeFrameInput;
+
 // EncodeFrameParams contains per-frame encoding parameters decided upon by
 // av1_encode_strategy() and passed down to av1_encode()
-typedef struct {
+struct EncodeFrameParams {
  int error_resilient_mode;
+  FRAME_TYPE frame_type;
+  int primary_ref_frame;
+  int order_offset;
+  int show_frame;

  // This is a bitmask of which reference slots can be used in this frame
  int ref_frame_flags;

+  // Speed level to use for this frame: Bigger number means faster.
+  int speed;
+
  unsigned int *frame_flags;
-} EncodeFrameParams;
+};
+typedef struct EncodeFrameParams EncodeFrameParams;

 // EncodeFrameResults contains information about the result of encoding a
 // single frame
@@ -905,6 +930,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                            const aom_rational_t *timebase);

 int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
               const EncodeFrameParams *const frame_params,
               EncodeFrameResults *const frame_results);

@@ -918,12 +944,12 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,

 int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);

-void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
-
 int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);

 int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);

+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
 int av1_update_entropy(AV1_COMP *cpi, int update);

 int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
@@ -937,8 +963,19 @@ int av1_get_quantizer(struct AV1_COMP *cpi);

 int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);

-int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n);
-int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n);
+// av1 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t timebase_units_to_ticks(const aom_rational_t *timebase,
+                                              int64_t n) {
+  return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+static INLINE int64_t ticks_to_timebase_units(const aom_rational_t *timebase,
+                                              int64_t n) {
+  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+}

 static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
@@ -975,6 +1012,13 @@ static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm,
  return (ref_frame <= ALTREF_FRAME);
 }

+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+  assert(buf != NULL);
+  ensure_mv_buffer(buf, cm);
+  buf->width = cm->width;
+  buf->height = cm->height;
+}
+
 // Token buffer is only used for palette tokens.
 static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
                                           int sb_size_log2,
@@ -1046,6 +1090,8 @@ static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {

 void av1_new_framerate(AV1_COMP *cpi, double framerate);

+void av1_setup_frame_size(AV1_COMP *cpi);
+
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))

 // Returns 1 if a frame is scaled and 0 otherwise.
@@ -1077,6 +1123,52 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
 // field.
 aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);

+#if CONFIG_COLLECT_PARTITION_STATS
+static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
+  FILE *f = fopen("partition_stats.csv", "w");
+  if (!f) {
+    return;
+  }
+
+  fprintf(f, "bsize,redo,");
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "decision_%d,", part);
+  }
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "attempt_%d,", part);
+  }
+  fprintf(f, "\n");
+
+  const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+
+  for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
+    fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]);
+    }
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]);
+    }
+    fprintf(f, "\n");
+  }
+  fclose(f);
+}
+
+static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
+  assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+         bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8);
+  switch (bsize) {
+    case BLOCK_128X128: return 0;
+    case BLOCK_64X64: return 1;
+    case BLOCK_32X32: return 2;
+    case BLOCK_16X16: return 3;
+    case BLOCK_8X8: return 4;
+    case BLOCK_4X4: return 5;
+    default: assert(0 && "Invalid bsize for partition_stats."); return -1;
+  }
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
@@ -284,6 +284,17 @@ static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
  return av1_cost_literal(1);
 }

+static const int golomb_bits_cost[32] = {
+  0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+  512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+static const int golomb_cost_diff[32] = {
+  0,       512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+  512 * 2, 0,   0,       0, 0,       0, 0, 0, 0,       0, 0, 0, 0, 0, 0, 0
+};
+
 static INLINE int get_golomb_cost(int abs_qc) {
  if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
    const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
@@ -293,6 +304,27 @@ static INLINE int get_golomb_cost(int abs_qc) {
  return 0;
 }

+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+                                        int *diff) {
+  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+  int golomb_bits = 0;
+  if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+    *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+  if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+    int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+    if (r < 32) {
+      golomb_bits = golomb_bits_cost[r];
+      *diff += golomb_cost_diff[r];
+    } else {
+      golomb_bits = get_golomb_cost(level);
+      *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+    }
+  }
+
+  return coeff_lps[base_range] + golomb_bits;
+}
+
 static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
  const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
  return coeff_lps[base_range] + get_golomb_cost(level);
@@ -732,7 +764,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(

  av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);

-  const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+  const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+      coeff_costs->lps_cost;
  int c = eob - 1;
  {
    const int pos = scan[c];
@@ -758,7 +791,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
      }
    }
  }
-  const int(*base_cost)[4] = coeff_costs->base_cost;
+  const int(*base_cost)[8] = coeff_costs->base_cost;
  for (c = eob - 2; c >= 1; --c) {
    const int pos = scan[c];
    const int coeff_ctx = coeff_contexts[pos];
@@ -1262,21 +1295,28 @@ static int hbt_create_hashes(TxbInfo *txb_info,
                          txb_eob_costs, p, block, fast_mode, rate_cost);
 }

-static AOM_FORCE_INLINE int get_coeff_cost_simple(
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
    int ci, tran_low_t abs_qc, int coeff_ctx,
    const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
-    const uint8_t *levels) {
+    const uint8_t *levels, int *cost_low) {
  // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
  // and not the last (scan_idx != eob - 1)
  assert(ci > 0);
  int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  int diff = 0;
+  if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
  if (abs_qc) {
    cost += av1_cost_literal(1);
    if (abs_qc > NUM_BASE_LEVELS) {
      const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
-      cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+      int brcost_diff = 0;
+      cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+                                    &brcost_diff);
+      diff += brcost_diff;
    }
  }
+  *cost_low = cost - diff;
+
  return cost;
 }

@@ -1369,13 +1409,23 @@ static INLINE void update_coeff_general(
    const int64_t rd = RDCOST(rdmult, rate, dist);

    tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
-    const int rate_low =
-        get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
-                               dc_sign_ctx, txb_costs, bwl, tx_class, levels);
-    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = qc_low = dqc_low = 0;
+      dist_low = dist0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift);
+      rate_low =
+          get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+    }
+
+    rd_low = RDCOST(rdmult, rate_low, dist_low);
    if (rd_low < rd) {
      qcoeff[ci] = qc_low;
      dqcoeff[ci] = dqc_low;
@@ -1409,28 +1459,28 @@ static AOM_FORCE_INLINE void update_coeff_simple(
    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
  } else {
    const tran_low_t abs_qc = abs(qc);
-    const tran_low_t tqc = tcoeff[ci];
-    const tran_low_t dqc = dqcoeff[ci];
-    const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
-                                           bwl, tx_class, levels);
-    if (abs(dqc) < abs(tqc)) {
+    const tran_low_t abs_tqc = abs(tcoeff[ci]);
+    const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+    int rate_low = 0;
+    const int rate = get_two_coeff_cost_simple(
+        ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
+    if (abs_dqc < abs_tqc) {
      *accu_rate += rate;
      return;
    }
-    const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+
+    const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift);
    const int64_t rd = RDCOST(rdmult, rate, dist);

-    const int sign = (qc < 0) ? 1 : 0;
-    tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
-    const int rate_low = get_coeff_cost_simple(
-        ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels);
+    const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+    const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift);
    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
    if (rd_low < rd) {
-      qcoeff[ci] = qc_low;
-      dqcoeff[ci] = dqc_low;
+      const int sign = (qc < 0) ? 1 : 0;
+      qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+      dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
      levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
      *accu_rate += rate_low;
    } else {
@@ -1468,14 +1518,24 @@ static AOM_FORCE_INLINE void update_coeff_eob(
    int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);

    tran_low_t qc_low, dqc_low;
-    get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
-    const tran_low_t abs_qc_low = abs_qc - 1;
-    const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
-    const int rate_low =
-        get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
-                               txb_costs, bwl, tx_class, levels);
-    const int64_t rd_low =
-        RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    tran_low_t abs_qc_low;
+    int64_t dist_low, rd_low;
+    int rate_low;
+    if (abs_qc == 1) {
+      abs_qc_low = 0;
+      dqc_low = qc_low = 0;
+      dist_low = 0;
+      rate_low = txb_costs->base_cost[coeff_ctx][0];
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+    } else {
+      get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+      abs_qc_low = abs_qc - 1;
+      dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+      rate_low =
+          get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+                                 dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+    }

    int lower_level_new_eob = 0;
    const int new_eob = si + 1;
@@ -421,11 +421,9 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
          (int32_t *)aom_memalign(
              16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
      CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
                      (InterModesInfo *)aom_malloc(
                          sizeof(*thread_data->td->inter_modes_info)));
-#endif

      for (int x = 0; x < 2; x++)
        for (int y = 0; y < 2; y++)
@@ -544,9 +542,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
      thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
      thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
      thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
-#endif
      for (int x = 0; x < 2; x++) {
        for (int y = 0; y < 2; y++) {
          memcpy(thread_data->td->hash_value_buffer[x][y],
@@ -662,9 +658,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
      this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start;
      this_tile->row_mt_info.num_threads_working = 0;

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
      av1_inter_mode_data_init(this_tile);
-#endif
      av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
                             this_tile->tile_info.mi_col_start,
                             this_tile->tile_info.mi_col_end, tile_row);
@@ -36,6 +36,7 @@
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/mcomp.h"
@@ -380,7 +381,7 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,

  // Override the default variance function to use MSE.
  v_fn_ptr.vf = get_block_variance_fn(bsize);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
  }

@@ -449,18 +450,6 @@ static int find_fp_qindex(aom_bit_depth_t bit_depth) {
  return i;
 }

-static void set_first_pass_params(AV1_COMP *cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (!cpi->refresh_alt_ref_frame && (cm->current_frame.frame_number == 0 ||
-                                      (cpi->frame_flags & FRAMEFLAGS_KEY))) {
-    cm->current_frame.frame_type = KEY_FRAME;
-  } else {
-    cm->current_frame.frame_type = INTER_FRAME;
-  }
-  // Do not use periodic key frames.
-  cpi->rc.frames_to_key = INT_MAX;
-}
-
 static double raw_motion_error_stdev(int *raw_motion_err_list,
                                     int raw_motion_err_counts) {
  int64_t sum_raw_err = 0;
@@ -486,7 +475,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,

 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
-void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
  int mb_row, mb_col;
  MACROBLOCK *const x = &cpi->td.mb;
  AV1_COMMON *const cm = &cpi->common;
@@ -558,7 +547,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
  brightness_factor = 0.0;
  neutral_count = 0.0;

-  set_first_pass_params(cpi);
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+
  av1_set_quantizer(cm, qindex);

  av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
@@ -701,14 +692,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
      // Accumulate the intra error.
      intra_error += (int64_t)this_error;

-      int stride = x->plane[0].src.stride;
+      const int hbd = is_cur_buf_hbd(xd);
+      const int stride = x->plane[0].src.stride;
      uint8_t *buf = x->plane[0].src.buf;
-      for (int r8 = 0; r8 < 2; ++r8)
+      for (int r8 = 0; r8 < 2; ++r8) {
        for (int c8 = 0; c8 < 2; ++c8) {
-          int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
          frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
              buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
        }
+      }

 #if CONFIG_FP_MB_STATS
      if (cpi->use_fp_mb_stats) {
@@ -730,7 +722,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
        struct buf_2d unscaled_last_source_buf_2d;

        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        if (is_cur_buf_hbd(xd)) {
          motion_error = highbd_get_prediction_error(
              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
        } else {
@@ -745,7 +737,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
            cpi->unscaled_last_source->y_buffer + recon_yoffset;
        unscaled_last_source_buf_2d.stride =
            cpi->unscaled_last_source->y_stride;
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        if (is_cur_buf_hbd(xd)) {
          raw_motion_error = highbd_get_prediction_error(
              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
        } else {
@@ -777,7 +769,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
            int gf_motion_error;

            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            if (is_cur_buf_hbd(xd)) {
              gf_motion_error = highbd_get_prediction_error(
                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
            } else {
@@ -854,8 +846,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
          xd->mi[0]->tx_size = TX_4X4;
          xd->mi[0]->ref_frame[0] = LAST_FRAME;
          xd->mi[0]->ref_frame[1] = NONE_FRAME;
-          av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale,
-                                         mb_col * mb_scale, NULL, bsize);
+          av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale,
+                                        mb_col * mb_scale, NULL, bsize,
+                                        AOM_PLANE_Y, AOM_PLANE_Y);
          av1_encode_sby_pass1(cm, x, bsize);
          sum_mvr += mv.row;
          sum_mvr_abs += abs(mv.row);
@@ -1038,7 +1031,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
    // something less than the full time between subsequent values of
    // cpi->source_time_stamp.
-    fps.duration = (double)(source->ts_end - source->ts_start);
+    fps.duration = (double)ts_duration;

    // Don't want to do output stats with a stack variable!
    twopass->this_frame_stats = fps;
@@ -1566,7 +1559,6 @@ static int calculate_boost_bits(int frame_count, int boost,
                0);
 }

-#if USE_SYMM_MULTI_LAYER
 // #define CHCEK_GF_PARAMETER
 #ifdef CHCEK_GF_PARAMETER
 void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
@@ -1693,7 +1685,6 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
 // Given the maximum allowed height of the pyramid structure, return the maximum
 // GF length supported by the same.
 static INLINE int get_max_gf_length(int max_pyr_height) {
-#if CONFIG_FIX_GF_LENGTH
  // We allow a frame to have at most two left/right descendants before changing
  // them into to a subtree, i.e., we allow the following structure:
  /*                    OUT_OF_ORDER_FRAME
@@ -1710,9 +1701,6 @@ static INLINE int get_max_gf_length(int max_pyr_height) {
      return MAX_GF_INTERVAL;  // Special case: uses the old pyramid structure.
    default: assert(0 && "Invalid max_pyr_height"); return -1;
  }
-#else
-  return 16;
-#endif  // CONFIG_FIX_GF_LENGTH
 }

 // Given the maximum allowed height of the pyramid structure, return the fixed
@@ -1722,11 +1710,12 @@ int av1_rc_get_fixed_gf_length(int max_pyr_height) {
  return AOMMIN(max_gf_length_allowed, MAX_GF_INTERVAL);
 }

-static void define_customized_gf_group_structure(AV1_COMP *cpi) {
+static void define_customized_gf_group_structure(
+    AV1_COMP *cpi, const EncodeFrameParams *const frame_params) {
  RATE_CONTROL *const rc = &cpi->rc;
  TWO_PASS *const twopass = &cpi->twopass;
  GF_GROUP *const gf_group = &twopass->gf_group;
-  const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
+  const int key_frame = frame_params->frame_type == KEY_FRAME;

  assert(rc->baseline_gf_interval >= MIN_GF_INTERVAL &&
         rc->baseline_gf_interval <=
@@ -1796,142 +1785,11 @@ static void define_customized_gf_group_structure(AV1_COMP *cpi) {
 // It is an example of how to define a GF stucture manually. The function will
 // result in exactly the same GF group structure as
 // define_customized_gf_group_structure() when rc->baseline_gf_interval == 4
-#if USE_MANUAL_GF4_STRUCT
-#define GF_INTERVAL_4 4
-static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
-  {
-      // gf_group->index == 0 (Frame 0)
-      // It can also be KEY frame. Will assign the proper value
-      // in define_gf_group_structure
-      OVERLAY_UPDATE,  // update_type (default value)
-      0,               // arf_src_offset
-      0,               // arf_pos_in_gf
-      0                // arf_update_idx
-  },
-  {
-      // gf_group->index == 1 (Frame 4)
-      ARF_UPDATE,         // update_type
-      GF_INTERVAL_4 - 1,  // arf_src_offset
-      0,                  // arf_pos_in_gf
-      0                   // arf_update_idx
-  },
-  {
-      // gf_group->index == 2 (Frame 2)
-      INTNL_ARF_UPDATE,          // update_type
-      (GF_INTERVAL_4 >> 1) - 1,  // arf_src_offset
-      0,                         // arf_pos_in_gf
-      0                          // arf_update_idx
-  },
-  {
-      // gf_group->index == 3 (Frame 1)
-      LAST_BIPRED_UPDATE,  // update_type
-      0,                   // arf_src_offset
-      0,                   // arf_pos_in_gf
-      0                    // arf_update_idx
-  },

-  {
-      // gf_group->index == 4 (Frame 2 - OVERLAY)
-      INTNL_OVERLAY_UPDATE,  // update_type
-      0,                     // arf_src_offset
-      2,                     // arf_pos_in_gf
-      0                      // arf_update_idx
-  },
-  {
-      // gf_group->index == 5 (Frame 3)
-      LF_UPDATE,  // update_type
-      0,          // arf_src_offset
-      0,          // arf_pos_in_gf
-      1           // arf_update_idx
-  }
-};
-
-static int define_gf_group_structure_4(AV1_COMP *cpi) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
-  assert(rc->baseline_gf_interval == GF_INTERVAL_4);
-
-  const int gf_update_frames = rc->baseline_gf_interval + 2;
-  int frame_index;
-
-  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
-    int param_idx = 0;
-
-    gf_group->bidir_pred_enabled[frame_index] = 0;
-
-    if (frame_index == 0) {
-      // gf_group->arf_src_offset[frame_index] = 0;
-      gf_group->brf_src_offset[frame_index] = 0;
-      gf_group->bidir_pred_enabled[frame_index] = 0;
-
-      // For key frames the frame target rate is already set and it
-      // is also the golden frame.
-      if (key_frame) continue;
-
-      gf_group->update_type[frame_index] =
-          gf4_multi_layer_params[frame_index][param_idx++];
-
-      if (rc->source_alt_ref_active) {
-        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-      } else {
-        gf_group->update_type[frame_index] = GF_UPDATE;
-      }
-      param_idx++;
-    } else {
-      gf_group->update_type[frame_index] =
-          gf4_multi_layer_params[frame_index][param_idx++];
-    }
-
-    // setup other parameters
-    gf_group->rf_level[frame_index] =
-        update_type_2_rf_level(gf_group->update_type[frame_index]);
-
-    // == arf_src_offset ==
-    gf_group->arf_src_offset[frame_index] =
-        gf4_multi_layer_params[frame_index][param_idx++];
-
-    // == arf_pos_in_gf ==
-    gf_group->arf_pos_in_gf[frame_index] =
-        gf4_multi_layer_params[frame_index][param_idx++];
-
-    // == arf_update_idx ==
-    gf_group->brf_src_offset[frame_index] =
-        gf4_multi_layer_params[frame_index][param_idx];
-  }
-
-  // NOTE: We need to configure the frame at the end of the sequence + 1 that
-  //       will be the start frame for the next group. Otherwise prior to the
-  //       call to av1_rc_get_second_pass_params() the data will be undefined.
-  gf_group->arf_update_idx[frame_index] = 0;
-  gf_group->arf_ref_idx[frame_index] = 0;
-
-  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
-
-  } else {
-    gf_group->update_type[frame_index] = GF_UPDATE;
-    gf_group->rf_level[frame_index] = GF_ARF_STD;
-  }
-
-  gf_group->bidir_pred_enabled[frame_index] = 0;
-  gf_group->brf_src_offset[frame_index] = 0;
-
-  // This value is only used for INTNL_OVERLAY_UPDATE
-  gf_group->arf_pos_in_gf[frame_index] = 0;
-
-  return gf_update_frames;
-}
-#endif  // USE_MANUAL_GF4_STRUCT
-#endif  // USE_SYMM_MULTI_LAYER
-
-static void define_gf_group_structure(AV1_COMP *cpi) {
+static void define_gf_group_structure(
+    AV1_COMP *cpi, const EncodeFrameParams *const frame_params) {
  RATE_CONTROL *const rc = &cpi->rc;

-#if USE_SYMM_MULTI_LAYER
  const int max_pyr_height = cpi->oxcf.gf_max_pyr_height;
  const int valid_customized_gf_length =
      max_pyr_height >= MIN_PYRAMID_LVL && max_pyr_height <= MAX_PYRAMID_LVL &&
@@ -1940,24 +1798,18 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
  // used the new structure only if extra_arf is allowed
  if (valid_customized_gf_length && rc->source_alt_ref_pending &&
      cpi->extra_arf_allowed > 0) {
-#if USE_MANUAL_GF4_STRUCT
-    if (rc->baseline_gf_interval == 4)
-      define_gf_group_structure_4(cpi);
-    else
-#endif
-      define_customized_gf_group_structure(cpi);
+    define_customized_gf_group_structure(cpi, frame_params);
    cpi->new_bwdref_update_rule = 1;
    return;
  } else {
    cpi->new_bwdref_update_rule = 0;
  }
-#endif

  TWO_PASS *const twopass = &cpi->twopass;
  GF_GROUP *const gf_group = &twopass->gf_group;
  int i;
  int frame_index = 0;
-  const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
+  const int key_frame = frame_params->frame_type == KEY_FRAME;

  // The use of bi-predictive frames are only enabled when following 3
  // conditions are met:
@@ -2168,35 +2020,28 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
  gf_group->brf_src_offset[frame_index] = 0;
 }

-#if USE_SYMM_MULTI_LAYER
-#define NEW_MULTI_LVL_BOOST_VBR_ALLOC 1
-
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
 #define LEAF_REDUCTION_FACTOR 0.75
 static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
  { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 }
 };
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#endif  // USE_SYMM_MULTI_LAYER
-static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
-                                   double group_error, int gf_arf_bits) {
+static void allocate_gf_group_bits(
+    AV1_COMP *cpi, int64_t gf_group_bits, double group_error, int gf_arf_bits,
+    const EncodeFrameParams *const frame_params) {
  RATE_CONTROL *const rc = &cpi->rc;
  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
  TWO_PASS *const twopass = &cpi->twopass;
  GF_GROUP *const gf_group = &twopass->gf_group;
  int i;
  int frame_index = 0;
-  int key_frame;
+  const int key_frame = frame_params->frame_type == KEY_FRAME;
  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
  int64_t total_group_bits = gf_group_bits;
  int ext_arf_boost[MAX_EXT_ARFS];

-  define_gf_group_structure(cpi);
+  define_gf_group_structure(cpi, frame_params);

  av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);

-  key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
  // For key frames the frame target rate is already set and it
  // is also the golden frame.
  // === [frame_index == 0] ===
@@ -2232,13 +2077,9 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
    }
  }

-#if USE_SYMM_MULTI_LAYER
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
  // Save.
  const int tmp_frame_index = frame_index;
  int budget_reduced_from_leaf_level = 0;
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#endif  // USE_SYMM_MULTI_LAYER

  // Allocate bits to the other frames in the group.
  const int normal_frames =
@@ -2269,7 +2110,6 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
      // TODO(zoeliu): To investigate whether the allocated bits on
      // BIPRED_UPDATE frames need to be further adjusted.
      gf_group->bit_allocation[frame_index] = target_frame_size;
-#if USE_SYMM_MULTI_LAYER
    } else if (cpi->new_bwdref_update_rule &&
               gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
      assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
@@ -2280,23 +2120,16 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,

      gf_group->bit_allocation[arf_pos] = target_frame_size;
      // Note: Boost, if needed, is added in the next loop.
-#endif  // USE_SYMM_MULTI_LAYER
    } else {
      assert(gf_group->update_type[frame_index] == LF_UPDATE ||
             gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
      gf_group->bit_allocation[frame_index] = target_frame_size;
-#if MULTI_LVL_BOOST_VBR_CQ
      if (cpi->new_bwdref_update_rule) {
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
        const int this_budget_reduction =
            (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
        gf_group->bit_allocation[frame_index] -= this_budget_reduction;
        budget_reduced_from_leaf_level += this_budget_reduction;
-#else
-        gf_group->bit_allocation[frame_index] -= (target_frame_size >> 1);
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
      }
-#endif  // MULTI_LVL_BOOST_VBR_CQ
    }

    ++frame_index;
@@ -2308,8 +2141,6 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
    }
  }

-#if USE_SYMM_MULTI_LAYER
-#if MULTI_LVL_BOOST_VBR_CQ
  if (budget_reduced_from_leaf_level > 0) {
    // Restore.
    frame_index = tmp_frame_index;
@@ -2323,16 +2154,11 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
        const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
        const int this_lvl = gf_group->pyramid_level[arf_pos];
        const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
        const double lvl_boost_factor =
            lvl_budget_factor[gf_group->pyramid_height - 2][dist2top];
        const int extra_size =
            (int)(budget_reduced_from_leaf_level * lvl_boost_factor /
                  gf_group->pyramid_lvl_nodes[this_lvl]);
-#else
-        const int target_frame_size = gf_group->bit_allocation[arf_pos];
-        const int extra_size = target_frame_size >> dist2top;
-#endif  // NEW_MULTI_LVL_BOOST_VBR_ALLOC
        gf_group->bit_allocation[arf_pos] += extra_size;
      }
      ++frame_index;
@@ -2344,14 +2170,8 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
      }
    }
  }
-#endif  // MULTI_LVL_BOOST_VBR_CQ
-#endif  // USE_SYMM_MULTI_LAYER

-#if USE_SYMM_MULTI_LAYER
  if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
-#else
-  if (rc->source_alt_ref_pending) {
-#endif
    if (cpi->num_extra_arfs) {
      // NOTE: For bit allocation, move the allocated bits associated with
      //       INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
@@ -2379,12 +2199,11 @@ static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
 }

-#if CONFIG_FIX_GF_LENGTH
 #define ARF_ABS_ZOOM_THRESH 4.4
-#endif  // CONFIG_FIX_GF_LENGTH

 // Analyse and define a gf/arf group.
-static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
+                            const EncodeFrameParams *const frame_params) {
  AV1_COMMON *const cm = &cpi->common;
  RATE_CONTROL *const rc = &cpi->rc;
  AV1EncoderConfig *const oxcf = &cpi->oxcf;
@@ -2394,10 +2213,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  int i;

  double boost_score = 0.0;
-#if !CONFIG_FIX_GF_LENGTH
-  double old_boost_score = 0.0;
-  int active_max_gf_interval;
-#endif  // !CONFIG_FIX_GF_LENGTH
  int active_min_gf_interval;
  double gf_group_err = 0.0;
 #if GROUP_ADAPTIVE_MAXQ
@@ -2427,14 +2242,15 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  int64_t gf_group_bits;
  double gf_group_error_left;
  int gf_arf_bits;
-  const int is_key_frame = frame_is_intra_only(cm);
-  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+  const int is_intra_only = frame_params->frame_type == KEY_FRAME ||
+                            frame_params->frame_type == INTRA_ONLY_FRAME;
+  const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active;

  cpi->extra_arf_allowed = 1;

  // Reset the GF group data structures unless this is a key
  // frame in which case it will already have been done.
-  if (is_key_frame == 0) {
+  if (!is_intra_only) {
    av1_zero(twopass->gf_group);
  }

@@ -2462,35 +2278,8 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  const double mv_ratio_accumulator_thresh =
      (cpi->initial_height + cpi->initial_width) / 4.0;

-#if CONFIG_FIX_GF_LENGTH
-  // TODO(urvang): Try the 'else' like logic to vary min and max interval.
+  // TODO(urvang): Try logic to vary min and max interval based on q.
  active_min_gf_interval = rc->min_gf_interval;
-#else
-  // Set a maximum and minimum interval for the GF group.
-  // If the image appears almost completely static we can extend beyond this.
-  {
-    int int_max_q = (int)(av1_convert_qindex_to_q(
-        twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
-    int int_lbq = (int)(av1_convert_qindex_to_q(
-        rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
-
-    active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
-    if (active_min_gf_interval > rc->max_gf_interval)
-      active_min_gf_interval = rc->max_gf_interval;
-
-    // The value chosen depends on the active Q range. At low Q we have
-    // bits to spare and are better with a smaller interval and smaller boost.
-    // At high Q when there are few bits to spare we are better with a longer
-    // interval to spread the cost of the GF.
-    active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
-
-    // We have: active_min_gf_interval <= rc->max_gf_interval
-    if (active_max_gf_interval < active_min_gf_interval)
-      active_max_gf_interval = active_min_gf_interval;
-    else if (active_max_gf_interval > rc->max_gf_interval)
-      active_max_gf_interval = rc->max_gf_interval;
-  }
-#endif  // CONFIG_FIX_GF_LENGTH

  double avg_sr_coded_error = 0;
  double avg_raw_err_stdev = 0;
@@ -2552,7 +2341,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    boost_score +=
        decay_accumulator *
        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-#if CONFIG_FIX_GF_LENGTH
    // If almost totally static, we will not use the the fixed GF length later,
    // so we can continue for more frames.
    if (i >= (av1_rc_get_fixed_gf_length(oxcf->gf_max_pyr_height) + 1) &&
@@ -2570,39 +2358,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
      break;
    }
-#else
-    // Break out conditions.
-    // Break at maximum of active_max_gf_interval unless almost totally static.
-    //
-    // Note that the addition of a test of rc->source_alt_ref_active is
-    // deliberate. The effect of this is that after a normal altref group even
-    // if the material is static there will be one normal length GF group
-    // before allowing longer GF groups. The reason for this is that in cases
-    // such as slide shows where slides are separated by a complex transition
-    // such as a fade, the arf group spanning the transition may not be coded
-    // at a very high quality and hence this frame (with its overlay) is a
-    // poor golden frame to use for an extended group.
-    if ((i >= (active_max_gf_interval + arf_active_or_kf) &&
-         ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) ||
-        (
-            // Don't break out with a very short interval.
-            (i >= active_min_gf_interval + arf_active_or_kf) &&
-            (!flash_detected) &&
-            ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
-             (abs_mv_in_out_accumulator > 3.0) ||
-             (mv_in_out_accumulator < -2.0) ||
-             ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
-      // If GF group interval is < 12, we force it to be 8. Otherwise,
-      // if it is >= 12, we keep it as is.
-      // NOTE: 'i' is 1 more than the GF group interval candidate that is being
-      //       checked.
-      if (i == (8 + 1) || i >= (12 + 1)) {
-        boost_score = old_boost_score;
-        break;
-      }
-    }
-    old_boost_score = boost_score;
-#endif  // CONFIG_FIX_GF_LENGTH
    *this_frame = next_frame;
  }
  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
@@ -2638,7 +2393,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 #define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
 #define REDUCE_GF_LENGTH_BY 1
  int alt_offset = 0;
-#if REDUCE_LAST_GF_LENGTH
  // The length reduction strategy is tweaked using AOM_Q mode, and doesn't work
  // for VBR mode.
  // Also, we don't have do adjustment for lossless mode.
@@ -2670,7 +2424,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      i -= roll_back;
    }
  }
-#endif  // REDUCE_LAST_GF_LENGTH

  // Should we use the alternate reference frame.
  if (use_alt_ref) {
@@ -2713,7 +2466,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
  }

-#if REDUCE_LAST_ALT_BOOST
 #define LAST_ALR_BOOST_FACTOR 0.2f
  rc->arf_boost_factor = 1.0;
  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
@@ -2723,7 +2475,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
    }
  }
-#endif

  if (!cpi->extra_arf_allowed) {
    cpi->num_extra_arfs = 0;
@@ -2732,7 +2483,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    // Note: When new pyramid structure is used through
    // 'define_customized_gf_group_structure()' function, this value is
    // overridden.
-#if USE_SYMM_MULTI_LAYER
    if (rc->baseline_gf_interval == MIN_GF_INTERVAL &&
        rc->source_alt_ref_pending) {
      cpi->num_extra_arfs = 1;
@@ -2741,18 +2491,8 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
                                                     rc->source_alt_ref_pending,
                                                     oxcf->gf_max_pyr_height);
    }
-#else
-    cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
-                                                   rc->source_alt_ref_pending,
-                                                   oxcf->gf_max_pyr_height);
-#endif  // USE_SYMM_MULTI_LAYER
  }

-#if !USE_SYMM_MULTI_LAYER
-  // Currently at maximum two extra ARFs' are allowed
-  assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
-#endif
-
  rc->frames_till_gf_update_due = rc->baseline_gf_interval;

  rc->bipred_group_interval = BFG_INTERVAL;
@@ -2814,20 +2554,21 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  // also a key frame in which case it has already been accounted for.
  if (rc->source_alt_ref_pending) {
    gf_group_error_left = gf_group_err - mod_frame_err;
-  } else if (is_key_frame == 0) {
+  } else if (!is_intra_only) {
    gf_group_error_left = gf_group_err - gf_first_frame_err;
  } else {
    gf_group_error_left = gf_group_err;
  }

  // Allocate bits to each of the frames in the GF group.
-  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits,
+                         frame_params);

  // Reset the file position.
  reset_fpf_position(twopass, start_pos);

  // Calculate a section intra ratio used in setting max loop filter.
-  if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+  if (frame_params->frame_type != KEY_FRAME) {
    twopass->section_intra_rating = calculate_section_intra_ratio(
        start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
  }
@@ -2966,7 +2707,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {

  av1_zero(next_frame);

-  cpi->common.current_frame.frame_type = KEY_FRAME;
  rc->frames_since_key = 0;

  // Reset the GF group data structures.
@@ -3195,51 +2935,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  twopass->modified_error_left -= kf_group_err;
 }

-void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
-                                            FRAME_UPDATE_TYPE update_type) {
-  RATE_CONTROL *rc = &cpi->rc;
-
-  cpi->refresh_last_frame = 1;
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_bwd_ref_frame = 0;
-  cpi->refresh_alt2_ref_frame = 0;
-  cpi->refresh_alt_ref_frame = 0;
-
-  rc->is_bwd_ref_frame = 0;
-
-  switch (update_type) {
-    case ARF_UPDATE:
-      cpi->refresh_alt_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-
-      rc->is_src_frame_alt_ref = 0;
-      break;
-    case INTNL_ARF_UPDATE:
-      cpi->refresh_alt2_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      rc->is_src_frame_alt_ref = 0;
-      rc->is_src_frame_ext_arf = 0;
-
-      break;
-    case BIPRED_UPDATE:
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      rc->is_bwd_ref_frame = 1;
-      break;
-    default: break;
-  }
-}
-
 static int is_skippable_frame(const AV1_COMP *cpi) {
  // If the current frame does not have non-zero motion vector detected in the
  // first  pass, and so do its previous and forward frames, then this frame
@@ -3259,7 +2954,8 @@ static int is_skippable_frame(const AV1_COMP *cpi) {
          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
 }

-void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
+void av1_rc_get_second_pass_params(AV1_COMP *cpi,
+                                   EncodeFrameParams *const frame_params) {
  AV1_COMMON *const cm = &cpi->common;
  CurrentFrame *const current_frame = &cm->current_frame;
  RATE_CONTROL *const rc = &cpi->rc;
@@ -3278,16 +2974,16 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
  // advance the input pointer as we already have what we need.
  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
-    av1_configure_buffer_updates(cpi);
+    av1_configure_buffer_updates(cpi, gf_group->update_type[gf_group->index]);
    target_rate = gf_group->bit_allocation[gf_group->index];
    target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
    rc->base_frame_target = target_rate;

    if (cpi->no_show_kf) {
      assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
-      current_frame->frame_type = KEY_FRAME;
+      frame_params->frame_type = KEY_FRAME;
    } else {
-      current_frame->frame_type = INTER_FRAME;
+      frame_params->frame_type = INTER_FRAME;
    }

    // Do the firstpass stats indicate that this frame is skippable for the
@@ -3342,16 +3038,17 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
    FIRSTPASS_STATS this_frame_copy;
    this_frame_copy = this_frame;
+    frame_params->frame_type = KEY_FRAME;
    // Define next KF group and assign bits to it.
    find_next_key_frame(cpi, &this_frame);
    this_frame = this_frame_copy;
  } else {
-    current_frame->frame_type = INTER_FRAME;
+    frame_params->frame_type = INTER_FRAME;
  }

  // Define a new GF/ARF group. (Should always enter here for key frames).
  if (rc->frames_till_gf_update_due == 0) {
-    define_gf_group(cpi, &this_frame);
+    define_gf_group(cpi, &this_frame, frame_params);

    rc->frames_till_gf_update_due = rc->baseline_gf_interval;

@@ -3369,7 +3066,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
 #endif
  }

-  av1_configure_buffer_updates(cpi);
+  av1_configure_buffer_updates(cpi, gf_group->update_type[gf_group->index]);

  // Do the firstpass stats indicate that this frame is skippable for the
  // partition search?
@@ -3379,7 +3076,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {

  target_rate = gf_group->bit_allocation[gf_group->index];

-  if (cpi->common.current_frame.frame_type == KEY_FRAME)
+  if (frame_params->frame_type == KEY_FRAME)
    target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
  else
    target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
@@ -114,12 +114,10 @@ typedef struct {
  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
  unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
  unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
-#if USE_SYMM_MULTI_LAYER
  unsigned char arf_pos_in_gf[MAX_STATIC_GF_GROUP_LENGTH + 1];
  unsigned char pyramid_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
  unsigned char pyramid_height;
  unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
-#endif  // USE_SYMM_MULTI_LAYER
  unsigned char brf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
  unsigned char bidir_pred_enabled[MAX_STATIC_GF_GROUP_LENGTH + 1];
  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
@@ -173,16 +171,16 @@ typedef struct {
 } TWO_PASS;

 struct AV1_COMP;
+struct EncodeFrameParams;

 void av1_init_first_pass(struct AV1_COMP *cpi);
 void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
-void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
 void av1_end_first_pass(struct AV1_COMP *cpi);

 void av1_init_second_pass(struct AV1_COMP *cpi);
-void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
-void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
-                                            FRAME_UPDATE_TYPE update_type);
+void av1_rc_get_second_pass_params(
+    struct AV1_COMP *cpi, struct EncodeFrameParams *const frame_params);

 // Post encode update of the rate control parameters for 2-pass
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
@@ -71,8 +71,8 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
  xd->mi[0]->mv[0] = x->best_mv;
  xd->mi[0]->ref_frame[1] = NONE_FRAME;

-  av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
-                                 BLOCK_16X16);
+  av1_enc_build_inter_predictor(&cpi->common, xd, mb_row, mb_col, NULL,
+                                BLOCK_16X16, AOM_PLANE_Y, AOM_PLANE_Y);

  /* restore UMV window */
  x->mv_limits = tmp_mv_limits;
@@ -336,7 +336,7 @@ static unsigned int setup_center_error(
    int *mvcost[2], unsigned int *sse1, int *distortion) {
  unsigned int besterr;
  if (second_pred != NULL) {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (is_cur_buf_hbd(xd)) {
      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
      if (mask) {
@@ -641,7 +641,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                int mask_stride, int invert_mask, int w, int h,
                                unsigned int *sse, int subpel_search) {
  unsigned int besterr;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
    if (second_pred != NULL) {
@@ -899,7 +899,8 @@ unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
  unsigned int mse;
  unsigned int sse;

-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
  mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
  mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, x->mv_cost_stack,
                     x->errorperbit);
@@ -2213,9 +2214,8 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                  : av1_get_ref_frame_hash_map(&cpi->common,
                                               x->e_mbd.mi[0]->ref_frame[0]);

-        av1_get_block_hash_value(
-            what, what_stride, block_width, &hash_value1, &hash_value2,
-            x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
+        av1_get_block_hash_value(what, what_stride, block_width, &hash_value1,
+                                 &hash_value2, is_cur_buf_hbd(&x->e_mbd), x);

        const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
        // for intra, at lest one matching can be found, itself.
@@ -2334,7 +2334,7 @@ static int upsampled_obmc_pref_error(
  unsigned int besterr;

  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
@@ -3784,6 +3784,77 @@ static const NN_CONFIG simple_motion_search_prune_part_nn_config_8 = {
 #undef NUM_LAYER_0_UNITS_8
 #undef NUM_LOGITS_8

+#define FEATURE_SIZE 19
+static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
+  2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
+  0.125296f, -1.134961f, 0.862757f,  -0.418799f, -0.637666f,
+  0.016232f, 0.345013f,  0.018823f,  -0.393394f, -1.130700f,
+  0.695357f, 0.112569f,  -0.341975f, -0.513882f, 5.7488966f,
+};
+
+static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
+  2.990993f,  0.423273f,  -0.926544f, 0.454646f,  -0.292698f,
+  -1.311632f, -0.284432f, 0.717141f,  -0.419257f, -0.574760f,
+  -0.674444f, 0.669047f,  -0.374255f, 0.380624f,  -0.804036f,
+  0.264021f,  0.004163f,  1.896802f,  0.924287f,  0.13490619f,
+};
+
+static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
+  2.795181f,  -0.136943f, -0.924842f, 0.405330f,  -0.463505f,
+  -0.584076f, -0.831472f, 0.382985f,  -0.597544f, -0.138915f,
+  -1.354350f, 0.466035f,  -0.553961f, 0.213202f,  -1.166429f,
+  0.010776f,  -0.096236f, 2.335084f,  1.699857f,  -0.58178353f,
+};
+
+static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
+  1.987888f,  -0.431100f, -1.687703f, 0.262602f,  -0.425298f,
+  -0.463870f, -1.493457f, 0.470917f,  -0.528457f, -0.087700f,
+  -1.815092f, 0.152883f,  -0.337908f, 0.093679f,  -1.548267f,
+  -0.042387f, -0.000861f, 2.556746f,  1.619192f,  0.03643292f,
+};
+
+static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
+  2.188344f,  -0.817528f, -2.119219f, 0.000000f,  -0.348167f,
+  -0.658074f, -1.960362f, 0.000000f,  -0.403080f, 0.282699f,
+  -2.061088f, 0.000000f,  -0.431919f, -0.127960f, -1.099550f,
+  0.000000f,  0.121622f,  2.017455f,  2.058228f,  -0.15475988f,
+};
+
+static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
+  -1.006689f, 0.777908f,  4.461072f,  -0.395782f, -0.014610f,
+  -0.853863f, 0.729997f,  -0.420477f, 0.282429f,  -1.194595f,
+  3.181220f,  -0.511416f, 0.117084f,  -1.149348f, 1.507990f,
+  -0.477212f, 0.202963f,  -1.469581f, 0.624461f,  -0.89081228f,
+};
+
+static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
+  -1.241117f, 0.844878f,  5.638803f,  -0.489780f, -0.108796f,
+  -4.576821f, 1.540624f,  -0.477519f, 0.227791f,  -1.443968f,
+  1.586911f,  -0.505125f, 0.140764f,  -0.464194f, 1.466658f,
+  -0.641166f, 0.195412f,  1.427905f,  2.080007f,  -1.98272777f,
+};
+
+static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
+  -2.130825f, 0.476023f,  5.907343f,  -0.516002f, -0.097471f,
+  -2.662754f, 0.614858f,  -0.576728f, 0.085261f,  -0.031901f,
+  0.727842f,  -0.600034f, 0.079326f,  0.324328f,  0.504502f,
+  -0.547105f, -0.037670f, 0.304995f,  0.369018f,  -2.66299987f,
+};
+
+static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
+  -1.626410f, 0.872047f,  5.414965f,  -0.554781f, -0.084514f,
+  -3.020550f, 0.467632f,  -0.382280f, 0.199568f,  0.426220f,
+  0.829426f,  -0.467100f, 0.153098f,  0.662994f,  0.327545f,
+  -0.560106f, -0.141610f, 0.403372f,  0.523991f,  -3.02891231f,
+};
+
+static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
+  -1.463349f, 0.375376f,  4.751430f, 0.000000f, -0.184451f,
+  -1.655447f, 0.443214f,  0.000000f, 0.127961f, 0.152435f,
+  0.083288f,  0.000000f,  0.143105f, 0.438012f, 0.073238f,
+  0.000000f,  -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
+};
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
@@ -29,6 +29,7 @@
 #include "av1/common/seg_common.h"

 #include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"

@@ -558,13 +559,11 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
                            arfgf_low_motion_minq, arfgf_high_motion_minq);
 }

-#if REDUCE_LAST_ALT_BOOST
 static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
  int *arfgf_high_motion_minq;
  ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
  return arfgf_high_motion_minq[q];
 }
-#endif

 static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
  const RATE_CONTROL *const rc = &cpi->rc;
@@ -965,12 +964,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
  const int bit_depth = cm->seq_params.bit_depth;
  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);

-#if CUSTOMIZED_GF
  const int is_intrl_arf_boost =
      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
-#else
-  const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
-#endif  // CUSTOMIZED_GF

  if (frame_is_intra_only(cm)) {
    if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
@@ -1053,17 +1048,14 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
      // Constrained quality use slightly lower active best.
      active_best_quality = active_best_quality * 15 / 16;

-#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
      if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
          (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
-#if REDUCE_LAST_ALT_BOOST
        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
          const int boost = min_boost - active_best_quality;

          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
        }
-#endif  // REDUCE_LAST_ALT_BOOST
        *arf_q = active_best_quality;
      } else if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
        assert(rc->arf_q >= 0);  // Ensure it is set to a valid value.
@@ -1074,7 +1066,6 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
          ++this_height;
        }
      }
-#endif  // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
    } else if (oxcf->rc_mode == AOM_Q) {
      if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
        active_best_quality = cq_level;
@@ -1082,17 +1073,14 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
          active_best_quality = get_gf_active_quality(rc, q, bit_depth);
          *arf_q = active_best_quality;
-#if REDUCE_LAST_ALT_BOOST
          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
          const int boost = min_boost - active_best_quality;

          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-#endif
        } else {
          assert(rc->arf_q >= 0);  // Ensure it is set to a valid value.
          active_best_quality = rc->arf_q;
        }
-#if USE_SYMM_MULTI_LAYER
        if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
          int this_height = gf_group_pyramid_level(cpi);
          while (this_height < gf_group->pyramid_height) {
@@ -1100,24 +1088,18 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
            ++this_height;
          }
        } else {
-#endif
          // Modify best quality for second level arfs. For mode AOM_Q this
          // becomes the baseline frame q.
          if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
            active_best_quality = (active_best_quality + cq_level + 1) / 2;
-#if USE_SYMM_MULTI_LAYER
        }
-#endif
      }
    } else {
      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-#if REDUCE_LAST_ALT_BOOST
      const int min_boost = get_gf_high_motion_quality(q, bit_depth);
      const int boost = min_boost - active_best_quality;

      active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-#endif
-#if USE_SYMM_MULTI_LAYER
      if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
        int this_height = gf_group_pyramid_level(cpi);
        while (this_height < gf_group->pyramid_height) {
@@ -1126,7 +1108,6 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
          ++this_height;
        }
      }
-#endif
    }
  } else {
    if (oxcf->rc_mode == AOM_Q) {
@@ -1293,16 +1274,12 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {

 static void update_golden_frame_stats(AV1_COMP *cpi) {
  RATE_CONTROL *const rc = &cpi->rc;
-#if CUSTOMIZED_GF
  const TWO_PASS *const twopass = &cpi->twopass;
  const GF_GROUP *const gf_group = &twopass->gf_group;
  const int is_intrnl_arf =
      cpi->oxcf.pass == 2
          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
          : cpi->refresh_alt2_ref_frame;
-#else
-  const int is_intnl_arf = cpi->refresh_alt2_ref_frame;
-#endif

  // Update the Golden frame usage counts.
  // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
@@ -1328,127 +1305,6 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
  }
 }

-// Define the reference buffers that will be updated post encode.
-void av1_configure_buffer_updates(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
-
-  // NOTE(weitinglin): Should we define another function to take care of
-  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
-
-  cpi->rc.is_src_frame_alt_ref = 0;
-  cpi->rc.is_bwd_ref_frame = 0;
-  cpi->rc.is_last_bipred_frame = 0;
-  cpi->rc.is_bipred_frame = 0;
-  cpi->rc.is_src_frame_ext_arf = 0;
-
-  switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
-    case KF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_alt2_ref_frame = 1;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-
-    case LF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case GF_UPDATE:
-      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
-      //               needed.
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    case OVERLAY_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_src_frame_alt_ref = 1;
-      break;
-
-    case ARF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-
-    case BRF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 1;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_bwd_ref_frame = 1;
-      break;
-
-    case LAST_BIPRED_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_last_bipred_frame = 1;
-      break;
-
-    case BIPRED_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_bipred_frame = 1;
-      break;
-
-    case INTNL_OVERLAY_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_bwd_ref_frame = 0;
-      cpi->refresh_alt2_ref_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-
-      cpi->rc.is_src_frame_alt_ref = 1;
-      cpi->rc.is_src_frame_ext_arf = 1;
-      break;
-
-    case INTNL_ARF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-#if USE_SYMM_MULTI_LAYER
-      if (cpi->new_bwdref_update_rule == 1) {
-        cpi->refresh_bwd_ref_frame = 1;
-        cpi->refresh_alt2_ref_frame = 0;
-      } else {
-#endif
-        cpi->refresh_bwd_ref_frame = 0;
-        cpi->refresh_alt2_ref_frame = 1;
-#if USE_SYMM_MULTI_LAYER
-      }
-#endif
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-
-    default: assert(0); break;
-  }
-}
-
 void av1_estimate_qp_gop(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  int gop_length = cpi->rc.baseline_gf_interval;
@@ -1463,30 +1319,28 @@ void av1_estimate_qp_gop(AV1_COMP *cpi) {

    cpi->twopass.gf_group.index = idx;
    rc_set_frame_target(cpi, target_rate, cm->width, cm->height);
-    av1_configure_buffer_updates(cpi);
+    av1_configure_buffer_updates(
+        cpi, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]);
    tpl_frame->base_qindex = rc_pick_q_and_bounds_two_pass(
        cpi, cm->width, cm->height, &bottom_index, &top_index, &arf_q);
    tpl_frame->base_qindex = AOMMAX(tpl_frame->base_qindex, 1);
  }
  // Reset the actual index and frame update
  cpi->twopass.gf_group.index = gf_index;
-  av1_configure_buffer_updates(cpi);
+  av1_configure_buffer_updates(
+      cpi, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]);
 }

 void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
  const AV1_COMMON *const cm = &cpi->common;
  const CurrentFrame *const current_frame = &cm->current_frame;
  RATE_CONTROL *const rc = &cpi->rc;
-#if CUSTOMIZED_GF
  const TWO_PASS *const twopass = &cpi->twopass;
  const GF_GROUP *const gf_group = &twopass->gf_group;
  const int is_intrnl_arf =
      cpi->oxcf.pass == 2
          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
          : cpi->refresh_alt2_ref_frame;
-#else
-  const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
-#endif

  const int qindex = cm->base_qindex;

@@ -1618,7 +1472,8 @@ static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
  return av1_rc_clamp_iframe_target_size(cpi, target);
 }

-void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi,
+                                    EncodeFrameParams *const frame_params) {
  AV1_COMMON *const cm = &cpi->common;
  RATE_CONTROL *const rc = &cpi->rc;
  CurrentFrame *const current_frame = &cm->current_frame;
@@ -1632,44 +1487,41 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
      (current_frame->frame_number == 0 ||
       (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 ||
       (cpi->oxcf.auto_key && 0))) {
-    current_frame->frame_type = KEY_FRAME;
+    frame_params->frame_type = KEY_FRAME;
    rc->this_key_frame_forced =
        current_frame->frame_number != 0 && rc->frames_to_key == 0;
    rc->frames_to_key = cpi->oxcf.key_freq;
    rc->kf_boost = DEFAULT_KF_BOOST;
    rc->source_alt_ref_active = 0;
  } else {
-    current_frame->frame_type = INTER_FRAME;
+    frame_params->frame_type = INTER_FRAME;
    if (sframe_enabled) {
      if (altref_enabled) {
        if (sframe_mode == 1) {
          // sframe_mode == 1: insert sframe if it matches altref frame.

          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_type != KEY_FRAME &&
              current_frame->frame_number != 0 && cpi->refresh_alt_ref_frame) {
-            current_frame->frame_type = S_FRAME;
+            frame_params->frame_type = S_FRAME;
          }
        } else {
          // sframe_mode != 1: if sframe will be inserted at the next available
          // altref frame

          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_type != KEY_FRAME &&
              current_frame->frame_number != 0) {
            rc->sframe_due = 1;
          }

          if (rc->sframe_due && cpi->refresh_alt_ref_frame) {
-            current_frame->frame_type = S_FRAME;
+            frame_params->frame_type = S_FRAME;
            rc->sframe_due = 0;
          }
        }
      } else {
        if (current_frame->frame_number % sframe_dist == 0 &&
-            current_frame->frame_type != KEY_FRAME &&
            current_frame->frame_number != 0) {
-          current_frame->frame_type = S_FRAME;
+          frame_params->frame_type = S_FRAME;
        }
      }
    }
@@ -1692,7 +1544,7 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
    av1_cyclic_refresh_update_parameters(cpi);

-  if (current_frame->frame_type == KEY_FRAME)
+  if (frame_params->frame_type == KEY_FRAME)
    target = calc_iframe_target_size_one_pass_vbr(cpi);
  else
    target = calc_pframe_target_size_one_pass_vbr(cpi);
@@ -1758,7 +1610,8 @@ static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
  return av1_rc_clamp_iframe_target_size(cpi, target);
 }

-void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi,
+                                    EncodeFrameParams *const frame_params) {
  AV1_COMMON *const cm = &cpi->common;
  RATE_CONTROL *const rc = &cpi->rc;
  CurrentFrame *const current_frame = &cm->current_frame;
@@ -1767,14 +1620,14 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
  if ((current_frame->frame_number == 0 ||
       (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 ||
       (cpi->oxcf.auto_key && 0))) {
-    current_frame->frame_type = KEY_FRAME;
+    frame_params->frame_type = KEY_FRAME;
    rc->this_key_frame_forced =
        current_frame->frame_number != 0 && rc->frames_to_key == 0;
    rc->frames_to_key = cpi->oxcf.key_freq;
    rc->kf_boost = DEFAULT_KF_BOOST;
    rc->source_alt_ref_active = 0;
  } else {
-    current_frame->frame_type = INTER_FRAME;
+    frame_params->frame_type = INTER_FRAME;
  }
  if (rc->frames_till_gf_update_due == 0) {
    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
@@ -1795,7 +1648,7 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
    av1_cyclic_refresh_update_parameters(cpi);

-  if (current_frame->frame_type == KEY_FRAME)
+  if (frame_params->frame_type == KEY_FRAME)
    target = calc_iframe_target_size_one_pass_cbr(cpi);
  else
    target = calc_pframe_target_size_one_pass_cbr(cpi);
@@ -34,27 +34,10 @@ extern "C" {
 // The maximum duration of a GF group that is static (e.g. a slide show).
 #define MAX_STATIC_GF_GROUP_LENGTH 250

-#define CUSTOMIZED_GF 1
-
-#if CONFIG_FIX_GF_LENGTH
 // Minimum and maximum height for the new pyramid structure.
 // (Old structure supports height = 1, but does NOT support height = 4).
 #define MIN_PYRAMID_LVL 2
 #define MAX_PYRAMID_LVL 4
-#define USE_SYMM_MULTI_LAYER 1
-#define REDUCE_LAST_ALT_BOOST 1
-#define REDUCE_LAST_GF_LENGTH 1
-#define MULTI_LVL_BOOST_VBR_CQ 1
-#else
-#define USE_SYMM_MULTI_LAYER 0
-#define REDUCE_LAST_ALT_BOOST 0
-#define REDUCE_LAST_GF_LENGTH 0
-#define MULTI_LVL_BOOST_VBR_CQ 0
-#endif
-
-#if USE_SYMM_MULTI_LAYER
-#define USE_MANUAL_GF4_STRUCT 0
-#endif

 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 16
@@ -191,9 +174,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
 int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate,
                                       int max_pyr_height);

-#if CONFIG_FIX_GF_LENGTH
 int av1_rc_get_fixed_gf_length(int max_pyr_height);
-#endif  // CONFIG_FIX_GF_LENGTH

 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -218,8 +199,11 @@ int av1_rc_get_fixed_gf_length(int max_pyr_height);

 // Functions to set parameters for encoding before the actual
 // encode_frame_to_data_rate() function.
-void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
-void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+struct EncodeFrameParams;
+void av1_rc_get_one_pass_vbr_params(
+    struct AV1_COMP *cpi, struct EncodeFrameParams *const frame_params);
+void av1_rc_get_one_pass_cbr_params(
+    struct AV1_COMP *cpi, struct EncodeFrameParams *const frame_params);

 // Post encode update of the rate control parameters based
 // on bytes used
@@ -283,8 +267,6 @@ void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);

 int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);

-void av1_configure_buffer_updates(struct AV1_COMP *cpi);
-
 void av1_estimate_qp_gop(struct AV1_COMP *cpi);

 #ifdef __cplusplus
@@ -508,6 +508,17 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
        av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
                                 fc->coeff_base_cdf[tx_size][plane][ctx], NULL);

+      for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+        pcost->base_cost[ctx][4] = 0;
+        pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] +
+                                   av1_cost_literal(1) -
+                                   pcost->base_cost[ctx][0];
+        pcost->base_cost[ctx][6] =
+            pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1];
+        pcost->base_cost[ctx][7] =
+            pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2];
+      }
+
      for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
        av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
                                 fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
@@ -538,6 +549,14 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
        //  printf("%5d ", pcost->lps_cost[ctx][i]);
        // printf("\n");
      }
+      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+        pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] =
+            pcost->lps_cost[ctx][0];
+        for (int i = 1; i <= COEFF_BASE_RANGE; ++i) {
+          pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] =
+              pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1];
+        }
+      }
    }
  }
 }
@@ -698,6 +717,10 @@ static const uint8_t bsize_model_cat_lookup[BLOCK_SIZES_ALL] = {
  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
 };

+static int sse_norm_model_cat_lookup(double sse_norm) {
+  return (sse_norm > 16.0);
+}
+
 static const double interp_rgrid_surf[4][33 * 18] = {
  {
      29.726102,   30.738006,   25.294088,   25.736759,   41.255961,
@@ -1273,8 +1296,9 @@ static const double interp_dgrid_surf[33 * 18] = {
  0.007205,  0.007205,  0.007203,  0.004341,  0.004340,  0.004338,
 };

-void av1_model_rd_surffit(BLOCK_SIZE bsize, double xm, double yl,
-                          double *rate_f, double *dist_f) {
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+                          double yl, double *rate_f, double *dist_f) {
+  (void)sse_norm;
  const double x_start = -0.5;
  const double x_end = 16.5;
  const double x_step = 1.0;
@@ -1283,7 +1307,7 @@ void av1_model_rd_surffit(BLOCK_SIZE bsize, double xm, double yl,
  const double y_step = 1.0;
  const double epsilon = 1e-6;
  const int stride = (int)rint((x_end - x_start) / x_step) + 1;
-  const int cat = bsize_model_cat_lookup[bsize];
+  const int rcat = bsize_model_cat_lookup[bsize];
  (void)y_end;

  xm = AOMMAX(xm, x_start + x_step + epsilon);
@@ -1301,7 +1325,7 @@ void av1_model_rd_surffit(BLOCK_SIZE bsize, double xm, double yl,

  const double yo = y - yi;
  const double xo = x - xi;
-  const double *prate = &interp_rgrid_surf[cat][(yi - 1) * stride + (xi - 1)];
+  const double *prate = &interp_rgrid_surf[rcat][(yi - 1) * stride + (xi - 1)];
  const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
  *rate_f = interp_bicubic(prate, stride, xo, yo);
  *dist_f = interp_bicubic(pdist, stride, xo, yo);
@@ -1311,85 +1335,102 @@ static const double interp_rgrid_curv[4][65] = {
  {
      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
-      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    23.801499,   28.387688,   33.388795,   42.298282,
      41.525408,   51.597692,   49.566271,   54.632979,   60.321507,
      67.730678,   75.766165,   85.324032,   96.600012,   120.839562,
      173.917577,  255.974908,  354.107573,  458.063476,  562.345966,
      668.568424,  772.072881,  878.598490,  982.202274,  1082.708946,
      1188.037853, 1287.702240, 1395.588773, 1490.825830, 1584.231230,
      1691.386090, 1766.822555, 1869.630904, 1926.743565, 2002.949495,
-      2047.431137, 2138.486068, 2154.743767, 2209.242472, 2278.252010,
-      2298.028834, 2302.326180, 2293.979995, 2275.826226, 2250.700821,
-      2221.439725, 2190.878887, 2161.854252, 2137.201768, 2119.757381,
-      2112.357039, 2117.836689, 2139.032277, 2178.779750, 2239.915056,
-  },
-  {
-      0.000000,     0.000000,     0.000000,     0.000000,     0.000000,
-      0.000000,     0.000000,     0.000000,     0.000000,     0.000000,
-      0.000000,     0.000000,     0.000000,     0.000000,     0.000000,
-      11.561347,    12.578139,    14.205101,    16.770584,    19.094853,
-      21.330863,    23.298907,    26.901921,    34.501017,    57.891733,
-      112.234763,   194.853189,   288.302032,   380.499422,   472.625309,
-      560.226809,   647.928463,   734.155122,   817.489721,   906.265783,
-      999.260562,   1094.489206,  1197.062998,  1293.296825,  1378.926484,
-      1472.760990,  1552.663779,  1635.196884,  1692.451951,  1759.741063,
-      1822.162720,  1916.515921,  1966.686071,  2031.647506,  2031.381029,
-      2067.971335,  2203.662704,  2500.257936,  3019.559830,  3823.371186,
-      4973.494802,  6531.733478,  8559.890013,  11119.767206, 14273.167855,
-      18081.894761, 22607.750723, 27912.538538, 34058.061008, 41106.120930,
+      2047.431137, 2138.486068, 2154.743767, 2209.242472, 2277.593051,
+      2290.996432, 2307.452938, 2343.567091, 2397.654644, 2469.425868,
+      2558.591037, 2664.860422, 2787.944296, 2927.552932, 3083.396602,
+      3255.185579, 3442.630134, 3645.440541, 3863.327072, 4096.000000,
  },
  {
      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    8.998436,    9.439592,    9.731837,    10.865931,
+      11.561347,   12.578139,   14.205101,   16.770584,   19.094853,
+      21.330863,   23.298907,   26.901921,   34.501017,   57.891733,
+      112.234763,  194.853189,  288.302032,  380.499422,  472.625309,
+      560.226809,  647.928463,  734.155122,  817.489721,  906.265783,
+      999.260562,  1094.489206, 1197.062998, 1293.296825, 1378.926484,
+      1472.760990, 1552.663779, 1635.196884, 1692.451951, 1759.741063,
+      1822.162720, 1916.515921, 1966.686071, 2031.647506, 2033.700134,
+      2087.847688, 2161.688858, 2242.536028, 2334.023491, 2436.337802,
+      2549.665519, 2674.193198, 2810.107395, 2957.594666, 3116.841567,
+      3288.034655, 3471.360486, 3667.005616, 3875.156602, 4096.000000,
+  },
+  {
      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    2.377584,    2.557185,    2.732445,    2.851114,
      3.281800,    3.765589,    4.342578,    5.145582,    5.611038,
      6.642238,    7.945977,    11.800522,   17.346624,   37.501413,
      87.216800,   165.860942,  253.865564,  332.039345,  408.518863,
      478.120452,  547.268590,  616.067676,  680.022540,  753.863541,
      834.529973,  919.489191,  1008.264989, 1092.230318, 1173.971886,
      1249.514122, 1330.510941, 1399.523249, 1466.923387, 1530.533471,
-      1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.056910,
-      1974.948082, 2063.374132, 2178.496387, 2324.476176, 2505.474827,
-      2725.653666, 2989.174023, 3300.197225, 3662.884600, 4081.397476,
-      4559.897180, 5102.545042, 5713.502387, 6396.930546, 7156.990844,
+      1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.075485,
+      1975.074651, 2060.159200, 2155.335095, 2259.762505, 2373.710437,
+      2497.447898, 2631.243895, 2775.367434, 2930.087523, 3095.673170,
+      3272.393380, 3460.517161, 3660.313520, 3872.051464, 4096.000000,
  },
  {
-      0.000000,     0.000000,     0.000000,     0.000000,     0.000000,
-      0.000000,     0.000000,     0.000000,     0.000000,     0.000000,
-      0.000000,     0.000000,     0.000000,     0.000000,     0.000000,
-      0.614483,     0.842937,     1.050824,     1.326663,     1.717750,
-      2.530591,     3.582302,     6.995373,     9.973335,     24.042464,
-      56.598240,    113.680735,   180.018689,   231.050567,   266.101082,
-      294.957934,   323.326511,   349.434429,   380.443211,   408.171987,
-      441.214916,   475.716772,   512.900000,   551.186939,   592.364455,
-      624.527378,   661.940693,   679.185473,   724.800679,   764.781792,
-      873.050019,   950.299001,   939.292954,   1052.406153,  1030.816617,
-      1086.316710,  1275.467594,  1671.923018,  2349.336727,  3381.362469,
-      4841.653990,  6803.865037,  9341.649358,  12528.660698, 16438.552805,
-      21144.979426, 26721.594308, 33242.051197, 40780.003840, 49409.105984,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.000000,    0.000000,    0.000000,    0.000000,
+      0.000000,    0.296997,    0.342545,    0.403097,    0.472889,
+      0.614483,    0.842937,    1.050824,    1.326663,    1.717750,
+      2.530591,    3.582302,    6.995373,    9.973335,    24.042464,
+      56.598240,   113.680735,  180.018689,  231.050567,  266.101082,
+      294.957934,  323.326511,  349.434429,  380.443211,  408.171987,
+      441.214916,  475.716772,  512.900000,  551.186939,  592.364455,
+      624.527378,  661.940693,  679.185473,  724.800679,  764.781792,
+      873.050019,  950.299001,  939.292954,  1052.406153, 1033.893184,
+      1112.182406, 1219.174326, 1337.296681, 1471.648357, 1622.492809,
+      1790.093491, 1974.713858, 2176.617364, 2396.067465, 2633.327614,
+      2888.661266, 3162.331876, 3454.602899, 3765.737789, 4096.000000,
  },
 };

-static const double interp_dgrid_curv[65] = {
-  14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
-  14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
-  14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
-  10.728960, 9.861975,  8.643612,  6.916021,  5.154769,  3.734940,  2.680051,
-  1.925506,  1.408410,  1.042223,  0.767641,  0.565392,  0.420116,  0.310427,
-  0.231711,  0.172999,  0.128293,  0.094992,  0.072171,  0.052972,  0.039354,
-  0.029555,  0.022857,  0.016832,  0.013297,  0.000000,  0.000000,  0.000000,
-  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,
-  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,  0.000000,
-  0.000000,  0.000000,
+static const double interp_dgrid_curv[2][65] = {
+  {
+      16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
+      15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
+      15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
+      13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
+      7.487633,  5.688649,  4.267515,  3.196300,  2.434201,  1.834064,
+      1.369920,  1.035921,  0.775279,  0.574895,  0.427232,  0.314123,
+      0.233236,  0.171440,  0.128188,  0.092762,  0.067569,  0.049324,
+      0.036330,  0.027008,  0.019853,  0.015539,  0.011093,  0.008733,
+      0.007624,  0.008105,  0.005427,  0.004065,  0.003427,  0.002848,
+      0.002328,  0.001865,  0.001457,  0.001103,  0.000801,  0.000550,
+      0.000348,  0.000193,  0.000085,  0.000021,  0.000000,
+  },
+  {
+      16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
+      15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
+      15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
+      13.073692, 12.222005, 11.237799, 9.985848,  8.898823,  7.423519,
+      5.995325,  4.773152,  3.744032,  2.938217,  2.294526,  1.762412,
+      1.327145,  1.020728,  0.765535,  0.570548,  0.425833,  0.313825,
+      0.232959,  0.171324,  0.128174,  0.092750,  0.067558,  0.049319,
+      0.036330,  0.027008,  0.019853,  0.015539,  0.011093,  0.008733,
+      0.007624,  0.008105,  0.005427,  0.004065,  0.003427,  0.002848,
+      0.002328,  0.001865,  0.001457,  0.001103,  0.000801,  0.000550,
+      0.000348,  0.000193,  0.000085,  0.000021,  -0.000000,
+  },
 };

-void av1_model_rd_curvfit(BLOCK_SIZE bsize, double xqr, double *rate_f,
-                          double *distbysse_f) {
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+                          double *rate_f, double *distbysse_f) {
  const double x_start = -15.5;
  const double x_end = 16.5;
  const double x_step = 0.5;
  const double epsilon = 1e-6;
-  const int cat = bsize_model_cat_lookup[bsize];
+  const int rcat = bsize_model_cat_lookup[bsize];
+  const int dcat = sse_norm_model_cat_lookup(sse_norm);
  (void)x_end;

  xqr = AOMMAX(xqr, x_start + x_step + epsilon);
@@ -1400,9 +1441,9 @@ void av1_model_rd_curvfit(BLOCK_SIZE bsize, double xqr, double *rate_f,

  assert(xi > 0);

-  const double *prate = &interp_rgrid_curv[cat][(xi - 1)];
-  const double *pdist = &interp_dgrid_curv[(xi - 1)];
+  const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
  *rate_f = interp_cubic(prate, xo);
+  const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
  *distbysse_f = interp_cubic(pdist, xo);
 }

@@ -1565,7 +1606,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  } else {
    rd->thresh_mult[THR_NEARESTMV] = 0;
    rd->thresh_mult[THR_NEARESTL2] = 0;
-    rd->thresh_mult[THR_NEARESTL3] = 0;
+    rd->thresh_mult[THR_NEARESTL3] = 100;
    rd->thresh_mult[THR_NEARESTB] = 0;
    rd->thresh_mult[THR_NEARESTA2] = 0;
    rd->thresh_mult[THR_NEARESTA] = 0;
@@ -1576,7 +1617,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_NEWL2] += 1000;
  rd->thresh_mult[THR_NEWL3] += 1000;
  rd->thresh_mult[THR_NEWB] += 1000;
-  rd->thresh_mult[THR_NEWA2] = 1000;
+  rd->thresh_mult[THR_NEWA2] = 1100;
  rd->thresh_mult[THR_NEWA] += 1000;
  rd->thresh_mult[THR_NEWG] += 1000;

@@ -1588,18 +1629,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_NEARA] += 1000;
  rd->thresh_mult[THR_NEARG] += 1000;

-  rd->thresh_mult[THR_GLOBALMV] += 2000;
+  rd->thresh_mult[THR_GLOBALMV] += 2200;
  rd->thresh_mult[THR_GLOBALL2] += 2000;
  rd->thresh_mult[THR_GLOBALL3] += 2000;
-  rd->thresh_mult[THR_GLOBALB] += 2000;
+  rd->thresh_mult[THR_GLOBALB] += 2400;
  rd->thresh_mult[THR_GLOBALA2] = 2000;
  rd->thresh_mult[THR_GLOBALG] += 2000;
-  rd->thresh_mult[THR_GLOBALA] += 2000;
+  rd->thresh_mult[THR_GLOBALA] += 2400;

-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1100;
  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
-  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 800;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 900;
  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
@@ -1617,17 +1658,17 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
  rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
  rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1530;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1870;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2750;

  rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1870;
  rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 1800;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;

  rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
@@ -1636,23 +1677,23 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 3000;

-  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1320;
  rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
  rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 2040;
  rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2250;

  rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
  rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
  rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1360;
  rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
-  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500;
+  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2400;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2250;

  rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
  rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
@@ -1665,7 +1706,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
  rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
  rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
-  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1870;
  rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
@@ -1679,7 +1720,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;

  rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1800;
  rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
  rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
@@ -1694,7 +1735,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;

-  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1440;
  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
  rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
@@ -1708,29 +1749,29 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
  rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
-  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
+  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2750;

  rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
  rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
  rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2640;
  rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
  rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;

  rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
  rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
-  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1800;
  rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
  rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
  rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;

-  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600;
-  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1760;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2400;
  rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
-  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1760;
+  rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2640;
  rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;

@@ -1738,21 +1779,21 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
  rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
  rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
  rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
-  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
+  rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1980;
+  rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2640;
  rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;

  rd->thresh_mult[THR_DC] += 1000;
  rd->thresh_mult[THR_PAETH] += 1000;
-  rd->thresh_mult[THR_SMOOTH] += 2000;
+  rd->thresh_mult[THR_SMOOTH] += 2200;
  rd->thresh_mult[THR_SMOOTH_V] += 2000;
  rd->thresh_mult[THR_SMOOTH_H] += 2000;
  rd->thresh_mult[THR_H_PRED] += 2000;
-  rd->thresh_mult[THR_V_PRED] += 2000;
+  rd->thresh_mult[THR_V_PRED] += 1800;
  rd->thresh_mult[THR_D135_PRED] += 2500;
-  rd->thresh_mult[THR_D203_PRED] += 2500;
+  rd->thresh_mult[THR_D203_PRED] += 2000;
  rd->thresh_mult[THR_D157_PRED] += 2500;
-  rd->thresh_mult[THR_D67_PRED] += 2500;
+  rd->thresh_mult[THR_D67_PRED] += 2000;
  rd->thresh_mult[THR_D113_PRED] += 2500;
  rd->thresh_mult[THR_D45_PRED] += 2500;
 }
@@ -656,10 +656,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                  unsigned int qstep, int *rate, int64_t *dist);

-void av1_model_rd_curvfit(BLOCK_SIZE bsize, double xqr, double *rate_f,
-                          double *distbysse_f);
-void av1_model_rd_surffit(BLOCK_SIZE bsize, double xm, double yl,
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
                          double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+                          double yl, double *rate_f, double *distbysse_f);

 int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
                            const MACROBLOCKD *xd);
@@ -151,10 +151,8 @@ typedef struct {

 sobel_xy sobel(const uint8_t *input, int stride, int i, int j, bool high_bd);

-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
-#endif

 #ifdef __cplusplus
 }  // extern "C"
@@ -237,46 +237,19 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
  }
 }

-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
-                                              MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                                              int mi_row, int mi_col,
-                                              int plane_from, int plane_to) {
-  int plane;
+static void build_inter_predictors_for_plane(const AV1_COMMON *cm,
+                                             MACROBLOCKD *xd, int mi_row,
+                                             int mi_col, const BUFFER_SET *ctx,
+                                             BLOCK_SIZE bsize, int plane_idx) {
+  const struct macroblockd_plane *pd = &xd->plane[plane_idx];
+  if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                           pd->subsampling_y))
+    return;
+
  const int mi_x = mi_col * MI_SIZE;
  const int mi_y = mi_row * MI_SIZE;
-  for (plane = plane_from; plane <= plane_to; ++plane) {
-    const struct macroblockd_plane *pd = &xd->plane[plane];
-    const int bw = pd->width;
-    const int bh = pd->height;
-
-    if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
-                             pd->subsampling_y))
-      continue;
-
-    build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
-  }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize) {
-  av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize) {
-  for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {
-    av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
-                                   plane_idx);
-  }
-}
-
-void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize, int plane_idx) {
-  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
-                                    plane_idx);
+  build_inter_predictors(cm, xd, plane_idx, xd->mi[0], 0, pd->width, pd->height,
+                         mi_x, mi_y);

  if (is_interintra_pred(xd->mi[0])) {
    BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
@@ -291,13 +264,14 @@ void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
  }
 }

-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize) {
-  const int num_planes = av1_num_planes(cm);
-  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
-  if (num_planes > 1)
-    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+                                   int plane_from, int plane_to) {
+  for (int plane_idx = plane_from; plane_idx <= plane_to; ++plane_idx) {
+    build_inter_predictors_for_plane(cm, xd, mi_row, mi_col, ctx, bsize,
+                                     plane_idx);
+  }
 }

 // TODO(sarahparker):
@@ -453,7 +427,7 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
  int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };

-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(xd)) {
    int len = sizeof(uint16_t);
    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
    dst_buf1[1] =
@@ -576,37 +550,41 @@ static void build_wedge_inter_predictor_from_buf(
  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
  mbmi->interinter_comp.seg_mask = xd->seg_mask;
  const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+  const int is_hbd = is_cur_buf_hbd(xd);

  if (is_compound && is_masked_compound_type(comp_data->type)) {
    if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      if (is_hbd) {
        av1_build_compound_diffwtd_mask_highbd(
            comp_data->seg_mask, comp_data->mask_type,
            CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
-      else
+      } else {
        av1_build_compound_diffwtd_mask(
            comp_data->seg_mask, comp_data->mask_type, ext_dst0,
            ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+      }
    }

-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    if (is_hbd) {
      build_masked_compound_highbd(
          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
          mbmi->sb_type, h, w, xd->bd);
-    else
+    } else {
      build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
                            ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
                            h, w);
+    }
  } else {
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    if (is_hbd) {
      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
                               xd->bd);
-    else
+    } else {
      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
                        0, NULL, 0, w, h);
+    }
  }
 }

@@ -23,21 +23,10 @@
 extern "C" {
 #endif

-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, BUFFER_SET *ctx,
-                                     BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    int mi_row, int mi_col, BUFFER_SET *ctx,
-                                    BLOCK_SIZE bsize, int plane_idx);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                   int mi_row, int mi_col, BUFFER_SET *ctx,
-                                   BLOCK_SIZE bsize);
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+                                   int plane_from, int plane_to);

 void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, const MV *src_mv,
@@ -80,7 +80,7 @@ static int frame_is_boosted(const AV1_COMP *cpi) {
 // partly on the screen area that over which they propogate. Propogation is
 // limited by transform block size but the screen area take up by a given block
 // size will be larger for a small image format stretched to full screen.
-static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+static BLOCK_SIZE set_partition_min_limit(const AV1_COMMON *const cm) {
  unsigned int screen_area = (cm->width * cm->height);

  // Select block size based on image format size.
@@ -103,10 +103,9 @@ static int has_internal_image_edge(const AV1_COMP *cpi) {
          (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
 }

-static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
-                                                       SPEED_FEATURES *sf,
-                                                       int speed) {
-  AV1_COMMON *const cm = &cpi->common;
+static void set_good_speed_feature_framesize_dependent(
+    const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;

@@ -201,11 +200,12 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
  }
 }

-static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
-                                                          SPEED_FEATURES *sf,
-                                                          int speed) {
-  AV1_COMMON *const cm = &cpi->common;
+static void set_good_speed_features_framesize_independent(
+    const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+  const AV1_COMMON *const cm = &cpi->common;
  const int boosted = frame_is_boosted(cpi);
+  const int is_boosted_arf2_bwd_type =
+      boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame;

  // Speed 0 for all speed features that give neutral coding performance change.
  sf->reduce_inter_modes = 1;
@@ -213,6 +213,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
  sf->ml_prune_rect_partition = 1;
  sf->ml_prune_ab_partition = 1;
  sf->ml_prune_4_partition = 1;
+  sf->simple_motion_search_prune_rect = 1;
  sf->adaptive_txb_search_level = 1;
  sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
  sf->model_based_prune_tx_search_level = 1;
@@ -222,8 +223,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
  // TODO(debargha): Test, tweak and turn on either 1 or 2
  sf->inter_mode_rd_model_estimation = 1;

-  sf->prune_ref_frame_for_rect_partitions =
-      !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+  sf->prune_ref_frame_for_rect_partitions = !is_boosted_arf2_bwd_type;
  sf->prune_ref_mode_for_partitions = sf->prune_ref_frame_for_rect_partitions;
  sf->less_rectangular_check_level = 1;
  sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
@@ -270,7 +270,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
    sf->prune_single_motion_modes_by_simple_trans = 1;

    sf->simple_motion_search_split_only = 1;
-    sf->simple_motion_search_prune_rect = 1;

    sf->disable_wedge_search_var_thresh = 0;
    sf->disable_wedge_search_edge_thresh = 0;
@@ -322,7 +321,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
    // See aomedia:1778.
    // sf->adaptive_motion_search = 1;
    sf->recode_loop = ALLOW_RECODE_KFARFGF;
-    sf->use_transform_domain_distortion = 1;
+    sf->use_transform_domain_distortion = boosted ? 1 : 2;
    sf->use_accurate_subpel_search = USE_2_TAPS;
    sf->adaptive_rd_thresh = 2;
    sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
@@ -333,24 +332,22 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
    // it with cpi->sf.disable_wedge_search_var_thresh.
    sf->disable_wedge_interintra_search = 1;
-    sf->perform_coeff_opt = boosted ? 0 : 3;
+    sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 3;
  }

  if (speed >= 4) {
    sf->use_intra_txb_hash = 0;
-    sf->use_mb_rd_hash = 0;
    sf->tx_type_search.fast_intra_tx_type_search = 1;
    sf->use_square_partition_only_threshold =
        boosted ? BLOCK_128X128 : BLOCK_4X4;
-    sf->tx_size_search_method =
-        frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
    sf->adaptive_pred_interp_filter = 0;
    sf->adaptive_mode_search = 1;
    sf->cb_partition_search = !boosted;
    sf->alt_ref_search_fp = 1;
    sf->skip_sharp_interp_filter_search = 1;
-    sf->perform_coeff_opt = boosted ? 0 : 4;
+    sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 4;
+    sf->adaptive_txb_search_level = boosted ? 2 : 3;
  }

  if (speed >= 5) {
@@ -400,6 +397,8 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
    sf->mv.search_method = FAST_HEX;
    sf->partition_search_type = REFERENCE_PARTITION;
    sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+    // TODO(any): evaluate adaptive_mode_search=1 for speed 7 & 8
+    sf->adaptive_mode_search = 2;
  }
  if (speed >= 8) {
    sf->mv.search_method = FAST_DIAMOND;
@@ -408,12 +407,12 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
  }
 }

-void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
  SPEED_FEATURES *const sf = &cpi->sf;
  const AV1EncoderConfig *const oxcf = &cpi->oxcf;

  if (oxcf->mode == GOOD) {
-    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+    set_good_speed_feature_framesize_dependent(cpi, sf, speed);
  }

  if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
@@ -427,7 +426,7 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
    cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 }

-void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
  AV1_COMMON *const cm = &cpi->common;
  SPEED_FEATURES *const sf = &cpi->sf;
  MACROBLOCK *const x = &cpi->td.mb;
@@ -576,7 +575,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
  sf->perform_coeff_opt = 0;

  if (oxcf->mode == GOOD)
-    set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
+    set_good_speed_features_framesize_independent(cpi, sf, speed);

  if (!cpi->seq_params_locked) {
    cpi->common.seq_params.enable_dual_filter &= !sf->disable_dual_filter;
@@ -591,28 +590,31 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
  cpi->diamond_search_sad = av1_diamond_search_sad;

  sf->allow_exhaustive_searches = 1;
-  int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+
+  const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
    sf->exhaustive_searches_thresh = (1 << 24);
  else
    sf->exhaustive_searches_thresh = (1 << 25);
-  sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
-  if (speed > 0)
+  sf->max_exaustive_pct = good_quality_max_mesh_pct[mesh_speed];
+  if (mesh_speed > 0)
    sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;

  for (i = 0; i < MAX_MESH_STEP; ++i) {
-    sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+    sf->mesh_patterns[i].range =
+        good_quality_mesh_patterns[mesh_speed][i].range;
    sf->mesh_patterns[i].interval =
-        good_quality_mesh_patterns[speed][i].interval;
+        good_quality_mesh_patterns[mesh_speed][i].interval;
  }
  if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
      (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
       cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
    for (i = 0; i < MAX_MESH_STEP; ++i) {
-      sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
-      sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+      sf->mesh_patterns[i].range = intrabc_mesh_patterns[mesh_speed][i].range;
+      sf->mesh_patterns[i].interval =
+          intrabc_mesh_patterns[mesh_speed][i].interval;
    }
-    sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+    sf->max_exaustive_pct = intrabc_max_mesh_pct[mesh_speed];
  }

  // Slow quant, dct and trellis not worthwhile for first pass
@@ -638,7 +640,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
  cpi->optimize_speed_feature =
      oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT;
  // FIXME: trellis not very efficient for quantisation matrices
-  if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
+  if (oxcf->using_qm) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
  if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT;

  x->min_partition_size = sf->default_min_partition_size;
@@ -653,9 +655,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
      comp_type_rd_threshold_mul[sf->prune_comp_type_by_comp_avg];
  cpi->max_comp_type_rd_threshold_div =
      comp_type_rd_threshold_div[sf->prune_comp_type_by_comp_avg];
-  int tx_domain_speed = (oxcf->speed >= MAX_TX_DOMAIN_EVAL_SPEED)
-                            ? MAX_TX_DOMAIN_EVAL_SPEED
-                            : oxcf->speed;
+  const int tx_domain_speed = AOMMIN(speed, MAX_TX_DOMAIN_EVAL_SPEED);
  cpi->tx_domain_dist_threshold = tx_domain_dist_thresholds[tx_domain_speed];

  // assert ensures that coeff_opt_dist_thresholds is accessed correctly
@@ -656,8 +656,10 @@ typedef struct SPEED_FEATURES {

 struct AV1_COMP;

-void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
-void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+                                                  int speed);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+                                                int speed);

 #ifdef __cplusplus
 }  // extern "C"
@@ -765,7 +765,8 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
  // Save input state
  uint8_t *input_buffer[MAX_MB_PLANE];
  int i;
-  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  const int is_hbd = is_cur_buf_hbd(mbd);
+  if (is_hbd) {
    predictor = CONVERT_TO_BYTEPTR(predictor16);
  } else {
    predictor = predictor8;
@@ -887,20 +888,21 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
              const unsigned int w = plane ? mb_uv_width : BW;
              const unsigned int h = plane ? mb_uv_height : BH;

-              if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+              if (is_hbd) {
                highbd_apply_temporal_filter_self(pred, pred_stride, w, h,
                                                  blk_fw[0], accum, cnt);
-              else
+              } else {
                apply_temporal_filter_self(pred, pred_stride, w, h, blk_fw[0],
                                           accum, cnt);
+              }

              pred += BLK_PELS;
              accum += BLK_PELS;
              cnt += BLK_PELS;
            }
          } else {
-            if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-              int adj_strength = strength + 2 * (mbd->bd - 8);
+            if (is_hbd) {
+              const int adj_strength = strength + 2 * (mbd->bd - 8);

              if (num_planes <= 1) {
                // Single plane case
@@ -943,7 +945,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
      }

      // Normalize filter output to produce AltRef frame
-      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (is_hbd) {
        uint16_t *dst1_16;
        uint16_t *dst2_16;
        dst1 = cpi->alt_ref_buffer.y_buffer;
@@ -1139,7 +1141,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
  struct lookahead_entry *buf = av1_lookahead_peek(cpi->lookahead, distance);
  double noiselevel;
-  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (is_cur_buf_hbd(mbd)) {
    noiselevel = highbd_estimate_noise(
        buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height,
        buf->img.y_stride, mbd->bd, EDGE_THRESHOLD);
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/reconinter_enc.h"
+
+typedef struct GF_PICTURE {
+  YV12_BUFFER_CONFIG *frame;
+  int ref_frame[7];
+} GF_PICTURE;
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               TX_SIZE tx_size, int64_t *recon_error,
+                               int64_t *sse) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+  uint16_t eob;
+  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+  const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX,
+                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff,
+                        p->dequant_QTX, &eob, scan_order->scan,
+                        scan_order->iscan);
+
+  *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  *recon_error = AOMMAX(*recon_error, 1);
+
+  *sse = (*sse) >> shift;
+  *sse = AOMMAX(*sse, 1);
+}
+
+static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                         TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break;
+    default: assert(0);
+  }
+}
+
+static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td,
+                                              uint8_t *cur_frame_buf,
+                                              uint8_t *ref_frame_buf,
+                                              int stride, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col) {
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS search_method = NSTEP;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  uint32_t bestsme = UINT_MAX;
+  int distortion;
+  uint32_t sse;
+  int cost_list[5];
+  const MvLimits tmp_mv_limits = x->mv_limits;
+
+  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+  av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                        search_method, 0, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col),
+                        (MI_SIZE * mi_row), 0);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  const int pw = block_size_wide[bsize];
+  const int ph = block_size_high[bsize];
+  bestsme = cpi->find_fractional_mv_step(
+      x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv,
+      x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
+      0, 0, pw, ph, 1, 1);
+
+  return bestsme;
+}
+
+static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                            struct scale_factors *sf, GF_PICTURE *gf_picture,
+                            int frame_idx, int16_t *src_diff, tran_low_t *coeff,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+                            int64_t *recon_error, int64_t *sse,
+                            TplDepStats *tpl_stats) {
+  AV1_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int pix_num = bw * bh;
+  int best_rf_idx = -1;
+  int_mv best_mv;
+  int64_t best_inter_cost = INT64_MAX;
+  int64_t inter_cost;
+  int rf_idx;
+  const InterpFilters kernel =
+      av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR);
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE mode;
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  MB_MODE_INFO mi_above, mi_left;
+
+  memset(tpl_stats, 0, sizeof(*tpl_stats));
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+  xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL;
+  xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL;
+
+  // Intra prediction search
+  for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
+    uint8_t *src, *dst;
+    int src_stride, dst_stride;
+
+    src = xd->cur_buf->y_buffer + mb_y_offset;
+    src_stride = xd->cur_buf->y_stride;
+
+    dst = &predictor[0];
+    dst_stride = bw;
+
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+    av1_predict_intra_block(
+        cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode,
+        0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0);
+
+    if (is_cur_buf_hbd(xd)) {
+      aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                                dst_stride, xd->bd);
+    } else {
+      aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                         dst_stride);
+    }
+
+    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+    intra_cost = aom_satd(coeff, pix_num);
+
+    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+  }
+
+  // Motion compensated prediction
+  best_mv.as_int = 0;
+
+  (void)mb_y_offset;
+  // Motion estimation column boundary
+  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
+  x->mv_limits.col_max =
+      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND);
+
+  for (rf_idx = 0; rf_idx < 7; ++rf_idx) {
+    if (ref_frame[rf_idx] == NULL) continue;
+
+    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                                  xd->cur_buf->y_stride, bsize, mi_row, mi_col);
+
+    // TODO(jingning): Not yet support high bit-depth in the next three
+    // steps.
+    ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
+    WarpTypesAllowed warp_types;
+    memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+
+    av1_build_inter_predictor(
+        ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride,
+        &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel,
+        &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3,
+        mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0);
+    if (is_cur_buf_hbd(xd)) {
+      aom_highbd_subtract_block(
+          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+    } else {
+      aom_subtract_block(bh, bw, src_diff, bw,
+                         xd->cur_buf->y_buffer + mb_y_offset,
+                         xd->cur_buf->y_stride, &predictor[0], bw);
+    }
+    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+    inter_cost = aom_satd(coeff, pix_num);
+    if (inter_cost < best_inter_cost) {
+      best_rf_idx = rf_idx;
+      best_inter_cost = inter_cost;
+      best_mv.as_int = x->best_mv.as_int;
+      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
+                         sse);
+    }
+  }
+  best_intra_cost = AOMMAX(best_intra_cost, 1);
+  best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+  tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
+
+  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  tpl_stats->mv.as_int = best_mv.as_int;
+}
+
+static int round_floor(int ref_pos, int bsize_pix) {
+  int round;
+  if (ref_pos < 0)
+    round = -(1 + (-ref_pos - 1) / bsize_pix);
+  else
+    round = ref_pos / bsize_pix;
+
+  return round;
+}
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
+  int width = 0, height = 0;
+  int bw = 4 << mi_size_wide_log2[bsize];
+  int bh = 4 << mi_size_high_log2[bsize];
+
+  switch (block) {
+    case 0:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 1:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 2:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    case 3:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    default: assert(0);
+  }
+
+  return width * height;
+}
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+  MV mv = tpl_stats->mv.as_mv;
+  int mv_row = mv.row >> 3;
+  int mv_col = mv.col >> 3;
+
+  int ref_pos_row = mi_row * MI_SIZE + mv_row;
+  int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+  const int pix_num = bw * bh;
+
+  // top-left on grid block location in pixel
+  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+  int block;
+
+  for (block = 0; block < 4; ++block) {
+    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+      int overlap_area = get_overlap_area(
+          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
+      int64_t mc_flow = tpl_stats->mc_dep_cost -
+                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+                            tpl_stats->intra_cost;
+
+      int idx, idy;
+
+      for (idy = 0; idy < mi_height; ++idy) {
+        for (idx = 0; idx < mi_width; ++idx) {
+          TplDepStats *des_stats =
+              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+                         (ref_mi_col + idx)];
+
+          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+          des_stats->mc_ref_cost +=
+              ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
+              pix_num;
+          assert(overlap_area >= 0);
+        }
+      }
+    }
+  }
+}
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  int idx, idy;
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      TplDepStats *tpl_ptr =
+          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+                         BLOCK_4X4);
+    }
+  }
+}
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, int stride,
+                            const TplDepStats *src_stats) {
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+  int idx, idy;
+
+  int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
+  int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
+
+  TplDepStats *tpl_ptr;
+
+  intra_cost = AOMMAX(1, intra_cost);
+  inter_cost = AOMMAX(1, inter_cost);
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
+    for (idx = 0; idx < mi_width; ++idx) {
+      tpl_ptr->intra_cost = intra_cost;
+      tpl_ptr->inter_cost = inter_cost;
+      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+      tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
+      tpl_ptr->mv.as_int = src_stats->mv.as_int;
+      ++tpl_ptr;
+    }
+  }
+}
+
+static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture,
+                              int frame_idx) {
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+  YV12_BUFFER_CONFIG *ref_frame[7] = {
+    NULL, NULL, NULL, NULL, NULL, NULL, NULL
+  };
+
+  AV1_COMMON *cm = &cpi->common;
+  struct scale_factors sf;
+  int rdmult, idx;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int mi_row, mi_col;
+
+  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
+  uint8_t *predictor;
+  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+
+  const BLOCK_SIZE bsize = BLOCK_32X32;
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+  int64_t recon_error, sse;
+
+  // Setup scaling factor
+  av1_setup_scale_factors_for_frame(
+      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      this_frame->y_crop_width, this_frame->y_crop_height);
+
+  if (is_cur_buf_hbd(xd))
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  else
+    predictor = predictor8;
+
+  // Prepare reference frame pointers. If any reference frame slot is
+  // unavailable, the pointer will be set to Null.
+  for (idx = 0; idx < 7; ++idx) {
+    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+  xd->cur_buf = this_frame;
+
+  // Get rd multiplier set up.
+  rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex);
+  if (rdmult < 1) rdmult = 1;
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  av1_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
+
+  tpl_frame->is_valid = 1;
+
+  cm->base_qindex = tpl_frame->base_qindex;
+  av1_frame_init_quantizer(cpi);
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+    // Motion estimation row boundary
+    x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
+    x->mv_limits.row_max =
+        (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND);
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+      TplDepStats tpl_stats;
+      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
+                      qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
+                      ref_frame, predictor, &recon_error, &sse, &tpl_stats);
+
+      // Motion flow dependency dispenser.
+      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                      tpl_frame->stride, &tpl_stats);
+
+      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+                       bsize);
+    }
+  }
+}
+
+static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture,
+                            const GF_GROUP *gf_group, int *tpl_group_frames,
+                            const EncodeFrameInput *const frame_input) {
+  AV1_COMMON *cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  int frame_idx = 0;
+  int i;
+  int gld_index = -1;
+  int alt_index = -1;
+  int lst_index = -1;
+  int extend_frame_count = 0;
+  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+
+  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+  int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
+                                                      -1, -1, -1, -1 };
+
+  // TODO(jingning): To be used later for gf frame type parsing.
+  (void)gf_group;
+
+  for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
+    if (frame_bufs[i].ref_count == 0) {
+      alloc_frame_mvs(cm, &frame_bufs[i]);
+      if (aom_realloc_frame_buffer(
+              &frame_bufs[i].buf, cm->width, cm->height,
+              seq_params->subsampling_x, seq_params->subsampling_y,
+              seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+              cm->byte_alignment, NULL, NULL, NULL))
+        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffer");
+
+      recon_frame_index[frame_idx] = i;
+      ++frame_idx;
+    }
+  }
+
+  for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
+    assert(recon_frame_index[i] >= 0);
+    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+  }
+
+  *tpl_group_frames = 0;
+
+  // Initialize Golden reference frame.
+  gf_picture[0].frame = NULL;
+  RefCntBuffer *ref_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
+  if (ref_buf) gf_picture[0].frame = &ref_buf->buf;
+  for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
+  gld_index = 0;
+  ++*tpl_group_frames;
+
+  // Initialize ARF frame
+  gf_picture[1].frame = frame_input->source;
+  gf_picture[1].ref_frame[0] = gld_index;
+  gf_picture[1].ref_frame[1] = lst_index;
+  gf_picture[1].ref_frame[2] = alt_index;
+  // TODO(yuec) Need o  figure out full AV1 reference model
+  for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
+  alt_index = 1;
+  ++*tpl_group_frames;
+
+  // Initialize P frames
+  for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+    struct lookahead_entry *buf =
+        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+
+    if (buf == NULL) break;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+
+    ++*tpl_group_frames;
+    lst_index = frame_idx;
+
+    if (frame_idx == cpi->rc.baseline_gf_interval + 1) break;
+  }
+
+  gld_index = frame_idx;
+  lst_index = AOMMAX(0, frame_idx - 1);
+  alt_index = -1;
+  ++frame_idx;
+
+  // Extend two frames outside the current gf group.
+  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+    struct lookahead_entry *buf =
+        av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+
+    if (buf == NULL) break;
+
+    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+    lst_index = frame_idx;
+    ++*tpl_group_frames;
+    ++extend_frame_count;
+  }
+}
+
+static void init_tpl_stats(AV1_COMP *cpi) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+    memset(tpl_frame->tpl_stats_ptr, 0,
+           tpl_frame->height * tpl_frame->width *
+               sizeof(*tpl_frame->tpl_stats_ptr));
+    tpl_frame->is_valid = 0;
+  }
+}
+
+void av1_tpl_setup_stats(AV1_COMP *cpi,
+                         const EncodeFrameInput *const frame_input) {
+  GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int tpl_group_frames = 0;
+  int frame_idx;
+
+  init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames, frame_input);
+
+  init_tpl_stats(cpi);
+
+  // Backward propagation from tpl_group_frames to 1.
+  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx)
+    mc_flow_dispenser(cpi, gf_picture, frame_idx);
+}
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_
+#define AOM_AV1_ENCODER_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_tpl_setup_stats(AV1_COMP *cpi,
+                         const EncodeFrameInput *const frame_input);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TPL_MODEL_H_
@@ -1407,6 +1407,13 @@ static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
  output[14] = x1[15];
  output[15] = x1[0];
 }
+
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+  const __m256i b = _mm256_madd_epi16(a, scale__r);
+  return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
+
 static INLINE void fidentity16x16_new_avx2(const __m256i *input,
                                           __m256i *output, int8_t cos_bit) {
  (void)cos_bit;
@@ -1990,781 +1997,6 @@ static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
  }
 }

-void btf_16_avx2(__m256i w0, __m256i w1, __m256i in0, __m256i in1,
-                 __m128i *out0, __m128i *out1, __m128i *out2, __m128i *out3,
-                 __m256i __rounding, int8_t cos_bit) {
-  __m256i t0 = _mm256_unpacklo_epi16(in0, in1);
-  __m256i t1 = _mm256_unpackhi_epi16(in0, in1);
-  __m256i u0 = _mm256_madd_epi16(t0, w0);
-  __m256i u1 = _mm256_madd_epi16(t1, w0);
-  __m256i v0 = _mm256_madd_epi16(t0, w1);
-  __m256i v1 = _mm256_madd_epi16(t1, w1);
-
-  __m256i a0 = _mm256_add_epi32(u0, __rounding);
-  __m256i a1 = _mm256_add_epi32(u1, __rounding);
-  __m256i b0 = _mm256_add_epi32(v0, __rounding);
-  __m256i b1 = _mm256_add_epi32(v1, __rounding);
-
-  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
-  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
-  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
-  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
-
-  __m256i temp0 = _mm256_packs_epi32(c0, c1);
-  __m256i temp1 = _mm256_packs_epi32(d0, d1);
-
-  *out0 = _mm256_castsi256_si128(temp0);
-  *out1 = _mm256_castsi256_si128(temp1);
-  *out2 = _mm256_extractf128_si256(temp0, 0x01);
-  *out3 = _mm256_extractf128_si256(temp1, 0x01);
-}
-static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
-                                    int8_t cos_bit) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
-
-  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
-  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
-  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
-  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
-  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
-
-  // stage 1
-  __m256i x1[8];
-  x1[0] = _mm256_adds_epi16(input[0], input[7]);
-  x1[7] = _mm256_subs_epi16(input[0], input[7]);
-  x1[1] = _mm256_adds_epi16(input[1], input[6]);
-  x1[6] = _mm256_subs_epi16(input[1], input[6]);
-  x1[2] = _mm256_adds_epi16(input[2], input[5]);
-  x1[5] = _mm256_subs_epi16(input[2], input[5]);
-  x1[3] = _mm256_adds_epi16(input[3], input[4]);
-  x1[4] = _mm256_subs_epi16(input[3], input[4]);
-
-  // stage 2
-  __m256i x2[8];
-  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
-  x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
-  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
-  x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
-  x2[4] = x1[4];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
-                  cos_bit);
-  x2[5] = x1[5];
-  x2[6] = x1[6];
-  x2[7] = x1[7];
-
-  // stage 3
-  __m256i x3[8];
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
-                  cos_bit);
-  x3[0] = x2[0];
-  x3[1] = x2[1];
-  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
-                  cos_bit);
-  x3[2] = x2[2];
-  x3[3] = x2[3];
-  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
-  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
-  x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
-  x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
-
-  // stage 4
-  __m256i x4[8];
-  x4[0] = x3[0];
-  x4[1] = x3[1];
-  x4[2] = x3[2];
-  x4[3] = x3[3];
-  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
-                  cos_bit);
-  x4[4] = x3[4];
-  x4[7] = x3[7];
-  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
-                  cos_bit);
-  x4[5] = x3[5];
-  x4[6] = x3[6];
-  // stage 5
-  output[0] = x4[0];
-  output[1] = x4[4];
-  output[2] = x4[2];
-  output[3] = x4[6];
-  output[4] = x4[1];
-  output[5] = x4[5];
-  output[6] = x4[3];
-  output[7] = x4[7];
-}
-static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
-                                     int8_t cos_bit) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const __m256i __zero = _mm256_setzero_si256();
-  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
-
-  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
-  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
-  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
-  __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
-  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
-  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
-  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
-  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
-  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
-  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
-  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
-  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
-
-  // stage 1
-  __m256i x1[8];
-  x1[0] = input[0];
-  x1[1] = _mm256_subs_epi16(__zero, input[7]);
-  x1[2] = _mm256_subs_epi16(__zero, input[3]);
-  x1[3] = input[4];
-  x1[4] = _mm256_subs_epi16(__zero, input[1]);
-  x1[5] = input[6];
-  x1[6] = input[2];
-  x1[7] = _mm256_subs_epi16(__zero, input[5]);
-
-  // stage 2
-  __m256i x2[8];
-  x2[0] = x1[0];
-  x2[1] = x1[1];
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
-                  cos_bit);
-  x2[2] = x1[2];
-  x2[3] = x1[3];
-  x2[4] = x1[4];
-  x2[5] = x1[5];
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
-                  cos_bit);
-  x2[6] = x1[6];
-  x2[7] = x1[7];
-
-  // stage 3
-  __m256i x3[8];
-  x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
-  x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
-  x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
-  x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
-  x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
-  x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
-  x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
-  x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
-
-  // stage 4
-  __m256i x4[8];
-  x4[0] = x3[0];
-  x4[1] = x3[1];
-  x4[2] = x3[2];
-  x4[3] = x3[3];
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
-                  cos_bit);
-  x4[4] = x3[4];
-  x4[5] = x3[5];
-  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
-                  cos_bit);
-  x4[6] = x3[6];
-  x4[7] = x3[7];
-
-  // stage 5
-  __m256i x5[8];
-  x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
-  x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
-  x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
-  x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
-  x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
-  x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
-  x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
-  x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
-
-  // stage 6
-  __m256i x6[8];
-  btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
-                  cos_bit);
-  x6[0] = x5[0];
-  x6[1] = x5[1];
-  btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
-                  cos_bit);
-  x6[2] = x5[2];
-  x6[3] = x5[3];
-  btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
-                  cos_bit);
-  x6[4] = x5[4];
-  x6[5] = x5[5];
-  btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
-                  cos_bit);
-  x6[6] = x5[6];
-  x6[7] = x5[7];
-
-  // stage 7
-  output[0] = x6[1];
-  output[1] = x6[6];
-  output[2] = x6[3];
-  output[3] = x6[4];
-  output[4] = x6[5];
-  output[5] = x6[2];
-  output[6] = x6[7];
-  output[7] = x6[0];
-}
-static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
-                                         int8_t cos_bit) {
-  (void)cos_bit;
-
-  output[0] = _mm256_adds_epi16(input[0], input[0]);
-  output[1] = _mm256_adds_epi16(input[1], input[1]);
-  output[2] = _mm256_adds_epi16(input[2], input[2]);
-  output[3] = _mm256_adds_epi16(input[3], input[3]);
-  output[4] = _mm256_adds_epi16(input[4], input[4]);
-  output[5] = _mm256_adds_epi16(input[5], input[5]);
-  output[6] = _mm256_adds_epi16(input[6], input[6]);
-  output[7] = _mm256_adds_epi16(input[7], input[7]);
-}
-static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
-                                     int8_t cos_bit) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
-  const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
-  __m128i temp0, temp1, temp2, temp3;
-  __m256i in0, in1;
-  __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
-  __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
-  __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
-  __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
-  __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
-  __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
-  __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
-  __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
-  __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
-  __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
-  __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
-  __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
-
-  __m256i cospi_arr[12];
-
-  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
-                                         cospi_m32_p32, 0x1);
-  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
-                                         cospi_p32_p32, 0x1);
-  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
-                                         cospi_p48_p16, 0x1);
-  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
-                                         cospi_m16_p48, 0x1);
-  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
-                                         cospi_m48_m16, 0x1);
-  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
-                                         cospi_m16_p48, 0x1);
-  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
-                                         cospi_p24_p40, 0x1);
-  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
-                                         cospi_m40_p24, 0x1);
-  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
-                                         cospi_p28_p36, 0x1);
-  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
-                                         cospi_m36_p28, 0x1);
-  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
-                                          cospi_p12_p52, 0x1);
-  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
-                                          cospi_m52_p12, 0x1);
-
-  __m256i x[8];
-  x[0] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
-  x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
-                                 0x1);
-  x[2] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
-  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
-                                 0x1);
-  x[4] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
-  x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
-                                 0x1);
-  x[6] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
-  x[7] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
-
-  // stage 1
-  __m256i x1[16];
-  x1[0] = _mm256_adds_epi16(x[0], x[1]);
-  x1[7] = _mm256_subs_epi16(x[0], x[1]);
-  x1[1] = _mm256_adds_epi16(x[2], x[3]);
-  x1[6] = _mm256_subs_epi16(x[2], x[3]);
-  x1[2] = _mm256_adds_epi16(x[4], x[5]);
-  x1[5] = _mm256_subs_epi16(x[4], x[5]);
-  x1[3] = _mm256_adds_epi16(x[6], x[7]);
-  x1[4] = _mm256_subs_epi16(x[6], x[7]);
-
-  // stage 2
-  __m256i x2[8];
-  x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
-  x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
-  x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
-  x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
-  x2[2] = x1[4];
-  x2[3] = x1[7];
-  btf_16_avx2(cospi_arr[0], cospi_arr[1], x1[5], x1[6], &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
-  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
-
-  // stage 3
-  __m256i x3[8];
-  x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
-  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
-  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
-  x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
-              _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
-  x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
-  x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
-  x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
-  x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
-  x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
-
-  // stage 4
-  __m256i x4[8];
-  in0 = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
-  in1 = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
-  btf_16_avx2(cospi_arr[2], cospi_arr[3], in0, in1, &output[0], &output[8],
-              &output[4], &output[12], __rounding_256, cos_bit);
-  x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
-  x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
-  x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
-  x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
-  in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
-  in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
-  btf_16_avx2(cospi_arr[4], cospi_arr[5], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
-  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
-
-  // stage 5
-  __m256i x5[8];
-  in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
-  in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
-  btf_16_avx2(cospi_arr[6], cospi_arr[7], in0, in1, &output[2], &output[14],
-              &output[10], &output[6], __rounding_256, cos_bit);
-  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
-  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
-  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
-  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
-
-  // stage 6
-  in0 = _mm256_permute2f128_si256(x5[4], x5[5], 0x20);
-  in1 = _mm256_permute2f128_si256(x5[6], x5[7], 0x31);
-  btf_16_avx2(cospi_arr[8], cospi_arr[9], in0, in1, &output[1], &output[15],
-              &output[9], &output[7], __rounding_256, cos_bit);
-  in0 = _mm256_permute2f128_si256(x5[5], x5[4], 0x31);
-  in1 = _mm256_permute2f128_si256(x5[7], x5[6], 0x20);
-  btf_16_avx2(cospi_arr[10], cospi_arr[11], in0, in1, &output[5], &output[11],
-              &output[13], &output[3], __rounding_256, cos_bit);
-}
-static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
-                                      int8_t cos_bit) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const __m256i __zero = _mm256_setzero_si256();
-  const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
-  __m256i in0, in1;
-  __m128i temp0, temp1, temp2, temp3;
-
-  __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
-  __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
-  __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
-  __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
-  __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
-  __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
-  __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
-  __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
-  __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
-  __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
-  __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
-  __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
-  __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
-  __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
-  __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
-  __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
-  __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
-  __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
-  __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
-
-  __m256i cospi_arr[20];
-
-  cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
-                                         cospi_p32_p32, 0x1);
-  cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
-                                         cospi_p32_m32, 0x1);
-  cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
-                                         cospi_p32_p32, 0x1);
-  cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
-                                         cospi_p32_m32, 0x1);
-  cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
-                                         cospi_m48_p16, 0x1);
-  cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
-                                         cospi_p16_p48, 0x1);
-  cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
-                                         cospi_m48_p16, 0x1);
-  cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
-                                         cospi_p16_p48, 0x1);
-  cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
-                                         cospi_p40_p24, 0x1);
-  cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
-                                         cospi_p24_m40, 0x1);
-  cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
-                                          cospi_m24_p40, 0x1);
-  cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
-                                          cospi_p40_p24, 0x1);
-  cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
-                                          cospi_p10_p54, 0x1);
-  cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
-                                          cospi_p54_m10, 0x1);
-  cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
-                                          cospi_p26_p38, 0x1);
-  cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
-                                          cospi_p38_m26, 0x1);
-  cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
-                                          cospi_p42_p22, 0x1);
-  cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
-                                          cospi_p22_m42, 0x1);
-  cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
-                                          cospi_p58_p06, 0x1);
-  cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
-                                          cospi_p06_m58, 0x1);
-
-  __m256i x[8];
-  x[0] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
-  x[1] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
-  x[2] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
-  x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
-                                 0x1);
-  x[4] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
-  x[5] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
-  x[6] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
-  x[7] =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
-
-  // stage 1
-  __m256i x1[8];
-  x1[0] = x[0];
-  x1[1] = _mm256_subs_epi16(__zero, x[7]);
-  x1[2] = x[2];
-  x1[3] = _mm256_subs_epi16(__zero, x[5]);
-  x1[4] = _mm256_subs_epi16(__zero, x[4]);
-  x1[5] = x[3];
-  x1[6] = _mm256_subs_epi16(__zero, x[6]);
-  x1[7] = x[1];
-
-  // stage 2
-  __m256i x2[8];
-  x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
-  x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
-  x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
-  x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
-  in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
-  in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
-  btf_16_avx2(cospi_arr[0], cospi_arr[1], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
-  x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
-  in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
-  in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
-  btf_16_avx2(cospi_arr[2], cospi_arr[3], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
-  x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
-
-  // stage 3
-  __m256i x3[8];
-  x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
-  x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
-  x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
-  x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
-  x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
-  x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
-  x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
-  x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
-
-  // stage 4
-  __m256i x4[8];
-  x4[0] = x3[0];
-  x4[1] = x3[1];
-  x4[4] = x3[4];
-  x4[5] = x3[5];
-  in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
-  in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
-  btf_16_avx2(cospi_arr[4], cospi_arr[5], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
-  x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
-  in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
-  in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
-  btf_16_avx2(cospi_arr[6], cospi_arr[7], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
-  x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
-
-  // stage 5
-  __m256i x5[8];
-  x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
-  x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
-  x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
-  x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
-  x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
-  x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
-  x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
-  x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
-
-  // stage 6
-  __m256i x6[8];
-  x6[0] = x5[0];
-  x6[1] = x5[2];
-  x6[2] = x5[1];
-  x6[3] = x5[3];
-  in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
-  in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
-  btf_16_avx2(cospi_arr[8], cospi_arr[9], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
-  x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
-  in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
-  in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
-  btf_16_avx2(cospi_arr[10], cospi_arr[11], in0, in1, &temp0, &temp1, &temp2,
-              &temp3, __rounding_256, cos_bit);
-  x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
-  x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
-
-  // stage 7
-  __m256i x7[8];
-  x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
-  x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
-  x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
-  x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
-  x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
-  x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
-  x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
-  x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
-
-  // stage 8
-  in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
-  in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
-  btf_16_avx2(cospi_arr[12], cospi_arr[13], in0, in1, &output[15], &output[0],
-              &output[13], &output[2], __rounding_256, cos_bit);
-  in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
-  in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
-  btf_16_avx2(cospi_arr[14], cospi_arr[15], in0, in1, &output[11], &output[4],
-              &output[9], &output[6], __rounding_256, cos_bit);
-  in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
-  in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
-  btf_16_avx2(cospi_arr[16], cospi_arr[17], in0, in1, &output[7], &output[8],
-              &output[5], &output[10], __rounding_256, cos_bit);
-  in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
-  in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
-  btf_16_avx2(cospi_arr[18], cospi_arr[19], in0, in1, &output[3], &output[12],
-              &output[1], &output[14], __rounding_256, cos_bit);
-}
-static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
-                                          int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i one = _mm256_set1_epi16(1);
-  __m256i temp;
-  for (int i = 0; i < 16; i += 2) {
-    temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
-                                   input[i + 1], 0x1);
-    const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
-    const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
-    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
-    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
-    temp = _mm256_packs_epi32(b_lo, b_hi);
-    output[i] = _mm256_castsi256_si128(temp);
-    output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
-  }
-}
-static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
-  fdct8x8_new_avx2,       // DCT_DCT
-  fdct8x8_new_avx2,       // ADST_DCT
-  fadst8x8_new_avx2,      // DCT_ADST
-  fadst8x8_new_avx2,      // ADST_ADST
-  fdct8x8_new_avx2,       // FLIPADST_DCT
-  fadst8x8_new_avx2,      // DCT_FLIPADST
-  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
-  fadst8x8_new_avx2,      // ADST_FLIPADST
-  fadst8x8_new_avx2,      // FLIPADST_ADST
-  fidentity8x8_new_avx2,  // IDTX
-  fidentity8x8_new_avx2,  // V_DCT
-  fdct8x8_new_avx2,       // H_DCT
-  fidentity8x8_new_avx2,  // V_ADST
-  fadst8x8_new_avx2,      // H_ADST
-  fidentity8x8_new_avx2,  // V_FLIPADST
-  fadst8x8_new_avx2       // H_FLIPADST
-};
-
-static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
-  fdct8x16_new_avx2,       // DCT_DCT
-  fadst8x16_new_avx2,      // ADST_DCT
-  fdct8x16_new_avx2,       // DCT_ADST
-  fadst8x16_new_avx2,      // ADST_ADST
-  fadst8x16_new_avx2,      // FLIPADST_DCT
-  fdct8x16_new_avx2,       // DCT_FLIPADST
-  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
-  fadst8x16_new_avx2,      // ADST_FLIPADST
-  fadst8x16_new_avx2,      // FLIPADST_ADST
-  fidentity8x16_new_avx2,  // IDTX
-  fdct8x16_new_avx2,       // V_DCT
-  fidentity8x16_new_avx2,  // H_DCT
-  fadst8x16_new_avx2,      // V_ADST
-  fidentity8x16_new_avx2,  // H_ADST
-  fadst8x16_new_avx2,      // V_FLIPADST
-  fidentity8x16_new_avx2   // H_FLIPADST
-};
-static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
-  fdct8x8_new_avx2,       // DCT_DCT
-  fadst8x8_new_avx2,      // ADST_DCT
-  fdct8x8_new_avx2,       // DCT_ADST
-  fadst8x8_new_avx2,      // ADST_ADST
-  fadst8x8_new_avx2,      // FLIPADST_DCT
-  fdct8x8_new_avx2,       // DCT_FLIPADST
-  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
-  fadst8x8_new_avx2,      // ADST_FLIPADST
-  fadst8x8_new_avx2,      // FLIPADST_ADST
-  fidentity8x8_new_avx2,  // IDTX
-  fdct8x8_new_avx2,       // V_DCT
-  fidentity8x8_new_avx2,  // H_DCT
-  fadst8x8_new_avx2,      // V_ADST
-  fidentity8x8_new_avx2,  // H_ADST
-  fadst8x8_new_avx2,      // V_FLIPADST
-  fidentity8x8_new_avx2,  // H_FLIPADST
-};
-
-static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
-  fdct8x16_new_avx2,       // DCT_DCT
-  fdct8x16_new_avx2,       // ADST_DCT
-  fadst8x16_new_avx2,      // DCT_ADST
-  fadst8x16_new_avx2,      // ADST_ADST
-  fdct8x16_new_avx2,       // FLIPADST_DCT
-  fadst8x16_new_avx2,      // DCT_FLIPADST
-  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
-  fadst8x16_new_avx2,      // ADST_FLIPADST
-  fadst8x16_new_avx2,      // FLIPADST_ADST
-  fidentity8x16_new_avx2,  // IDTX
-  fidentity8x16_new_avx2,  // V_DCT
-  fdct8x16_new_avx2,       // H_DCT
-  fidentity8x16_new_avx2,  // V_ADST
-  fadst8x16_new_avx2,      // H_ADST
-  fidentity8x16_new_avx2,  // V_FLIPADST
-  fadst8x16_new_avx2       // H_FLIPADST
-};
-void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
-                                int stride, TX_TYPE tx_type, int bd) {
-  (void)bd;
-  __m128i buf0[16], buf1[16];
-  __m256i buf2[8];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
-  const int txw_idx = get_txw_idx(TX_8X16);
-  const int txh_idx = get_txh_idx(TX_8X16);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 8;
-  const int height = 16;
-  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
-  int ud_flip, lr_flip;
-
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
-  } else {
-    load_buffer_16bit_to_16bit(input, stride, buf0, height);
-  }
-  round_shift_16bit(buf0, height, shift[0]);
-  col_txfm(buf0, buf0, cos_bit_col);
-  round_shift_16bit(buf0, height, shift[1]);
-  transpose_16bit_8x8(buf0, buf1);
-  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
-
-  __m128i *bufl, *bufu;
-  if (lr_flip) {
-    bufl = buf0;
-    bufu = buf0 + 8;
-    flip_buf_sse2(buf1 + width * 0, bufl, width);
-    flip_buf_sse2(buf1 + width * 1, bufu, width);
-  } else {
-    bufl = buf1 + width * 0;
-    bufu = buf1 + width * 1;
-  }
-  pack_reg(bufl, bufu, buf2);
-  row_txfm(buf2, buf2, cos_bit_row);
-  round_shift_16bit_w16_avx2(buf2, width, shift[2]);
-  transpose_16bit_16x8_avx2(buf2, buf2);
-  store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8);
-}
-void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
-                                int stride, TX_TYPE tx_type, int bd) {
-  (void)bd;
-  __m128i buf0[16], buf1[16];
-  __m256i buf2[8];
-  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
-  const int txw_idx = get_txw_idx(TX_16X8);
-  const int txh_idx = get_txh_idx(TX_16X8);
-  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
-  const int width = 16;
-  const int height = 8;
-  const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
-  const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
-  __m128i *buf;
-  int ud_flip, lr_flip;
-
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  if (ud_flip) {
-    load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
-    load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
-  } else {
-    load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
-    load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
-  }
-  pack_reg(buf0, &buf0[8], buf2);
-  round_shift_16bit_w16_avx2(buf2, height, shift[0]);
-  col_txfm(buf2, buf2, cos_bit_col);
-  round_shift_16bit_w16_avx2(buf2, height, shift[1]);
-  transpose_16bit_16x8_avx2(buf2, buf2);
-  extract_reg(buf2, buf1);
-
-  if (lr_flip) {
-    buf = buf0;
-    flip_buf_sse2(buf1, buf, width);
-  } else {
-    buf = buf1;
-  }
-  row_txfm(buf, buf, cos_bit_row);
-  round_shift_16bit(buf, width, shift[2]);
-  transpose_16bit_8x8(buf, buf);
-  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
-  transpose_16bit_8x8(buf + 8, buf + 8);
-  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
-}
 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
  av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
  av1_lowbd_fwd_txfm2d_8x8_sse2,   // 8x8 transform
@@ -2773,8 +2005,8 @@ static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
  lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
  av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
  av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
-  lowbd_fwd_txfm2d_8x16_avx2,      // 8x16 transform
-  lowbd_fwd_txfm2d_16x8_avx2,      // 16x8 transform
+  av1_lowbd_fwd_txfm2d_8x16_sse2,  // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_sse2,  // 16x8 transform
  lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
  lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
  lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
@@ -101,8 +101,6 @@ set_aom_config_var(CONFIG_DENOISE 1 NUMBER
                   "Denoise/noise modeling support in encoder.")
 set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER
                   "Enables encoder config file support.")
-set_aom_config_var(CONFIG_FIX_GF_LENGTH 1 NUMBER
-                   "Fix the GF length if possible")
 set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.")
 set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER
                   "Enables internal encoder stats.")
@@ -118,8 +116,6 @@ set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.")
 set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.")

 # AV1 experiment flags.
-set_aom_config_var(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 NUMBER
-                   "AV1 experiment flag.")
 set_aom_config_var(CONFIG_SPEED_STATS 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.")
@@ -132,7 +128,9 @@ set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL 1 NUMBER
 set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_ONE_PASS_SVM 0 NUMBER "AV1 experiment flag.")
 set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 NUMBER
-                   "Disable full_pixel_motion_search_based_split on BLOCK_8X8")
+                   "Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
+set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 NUMBER
+                   "Collect stats on partition decisions.")

 #
 # Variables in this section control optional features of the build system.
@@ -149,6 +149,11 @@ const AvxInterface *get_aom_encoder_by_name(const char *name) {

  return NULL;
 }
+
+// large scale tile encoding
+static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC,
+                                              &aom_codec_av1_cx };
+const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; }
 #endif  // CONFIG_AV1_ENCODER

 #if CONFIG_AV1_DECODER
@@ -85,6 +85,9 @@ enum {
  NV12,   // Tile output in NV12 format.
 } UENUM1BYTE(OUTPUT_FORMAT);

+// The fourcc for large_scale_tile encoding is "LSTC".
+#define LST_FOURCC 0x4354534c
+
 struct FileTypeDetectionBuffer {
  char buf[4];
  size_t buf_read;
@@ -150,6 +153,7 @@ typedef struct AvxInterface {
 int get_aom_encoder_count(void);
 const AvxInterface *get_aom_encoder_by_index(int i);
 const AvxInterface *get_aom_encoder_by_name(const char *name);
+const AvxInterface *get_aom_lst_encoder(void);

 int get_aom_decoder_count(void);
 const AvxInterface *get_aom_decoder_by_index(int i);
@@ -121,3 +121,7 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader) {
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
  return &reader->info;
 }
+
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) {
+  reader->info.codec_fourcc = fourcc;
+}
@@ -50,6 +50,9 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader);
 // Fills AvxVideoInfo with information from opened video file.
 const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);

+// Set fourcc.
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
@@ -75,3 +75,7 @@ int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,

  return 1;
 }
+
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) {
+  writer->info.codec_fourcc = fourcc;
+}
@@ -37,6 +37,8 @@ void aom_video_writer_close(AvxVideoWriter *writer);
 // Writes frame bytes to the file.
 int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
                                 size_t size, int64_t pts);
+// Set fourcc.
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc);

 #ifdef __cplusplus
 }  // extern "C"
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * See build_av1_dec_fuzzer.sh for building instructions.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory>
+
+#include "config/aom_config.h"
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/mem_ops.h"
+#include "common/ivfdec.h"
+
+static void close_file(FILE *file) { fclose(file); }
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  std::unique_ptr<FILE, decltype(&close_file)> file(
+      fmemopen((void *)data, size, "rb"), &close_file);
+  if (file == nullptr) {
+    return 0;
+  }
+
+  char header[32];
+  if (fread(header, 1, 32, file.get()) != 32) {
+    return 0;
+  }
+  const AvxInterface *decoder = get_aom_decoder_by_name("av1");
+  if (decoder == nullptr) {
+    return 0;
+  }
+
+  aom_codec_ctx_t codec;
+  // Set thread count in the range [1, 64].
+  const unsigned int threads = (header[0] & 0x3f) + 1;
+  aom_codec_dec_cfg_t cfg = { threads, 0, 0, CONFIG_LOWBITDEPTH };
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), &cfg, 0)) {
+    return 0;
+  }
+
+  uint8_t *buffer = nullptr;
+  size_t buffer_size = 0;
+  size_t frame_size = 0;
+  while (!ivf_read_frame(file.get(), &buffer, &frame_size, &buffer_size,
+                         nullptr)) {
+    const aom_codec_err_t err =
+        aom_codec_decode(&codec, buffer, frame_size, nullptr);
+    static_cast<void>(err);
+    aom_codec_iter_t iter = nullptr;
+    aom_image_t *img = nullptr;
+    while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) {
+    }
+  }
+  aom_codec_destroy(&codec);
+  free(buffer);
+  return 0;
+}
@@ -0,0 +1,78 @@
+#!/bin/bash
+#
+# Copyright (c) 2019, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+###############################################################################
+# Fuzzer for libaom decoder.
+# ==========================
+# Requirements
+# ---------------------
+# Clang6.0 or above (must support -fsanitize=fuzzer)
+#
+# References:
+# ---------------------
+# http://llvm.org/docs/LibFuzzer.html
+# https://github.com/google/oss-fuzz
+#
+# Steps to build / run
+# ---------------------
+
+set -eu
+
+# Have a copy of AOM and a build directory ready.
+if [[ $# -ne 2 ]]; then
+  echo "Pass in the AOM source tree as first argument, and a build directory "
+  echo "as the second argument. The AOM source tree can be obtained via: "
+  echo "  git clone https://aomedia.googlesource.com/aom"
+  exit 2
+fi
+if [[ -z "$CC" ]]; then
+  echo "Set the CC environment variable to point to your C compiler."
+  exit 2
+fi
+if [[ -z "$CXX" ]]; then
+  echo "Set the CXX environment variable to point to your C++ compiler."
+  exit 2
+fi
+
+AOM_DIR=$1
+BUILD_DIR=$2
+# Run CMake with address sanitizer enabled and build the codec.
+# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows
+# in the transform functions. Also set memory limits.
+EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
+cd "${BUILD_DIR}"
+cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
+  -DCONFIG_SCALABILITY=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_AV1_ENCODER=0 \
+  -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DCONFIG_SIZE_LIMIT=1 \
+  -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
+  -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
+  -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=address
+
+# Build the codec.
+make -j$(nproc)
+
+# Build some libaom utils that are not part of the core lib.
+$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \
+  ${AOM_DIR}/common/ivfdec.c -o ${BUILD_DIR}/ivfdec.o
+
+$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \
+  ${AOM_DIR}/common/tools_common.c -o ${BUILD_DIR}/tools_common.o
+
+# Build the av1 fuzzer
+$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
+    -fsanitize=fuzzer -Wl,--start-group \
+    ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
+    ${BUILD_DIR}/libaom.a ${BUILD_DIR}/ivfdec.o ${BUILD_DIR}/tools_common.o \
+    -Wl,--end-group
+
+echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
+echo "Create a corpus directory, copy IVF files in there, and run:"
+echo "  av1_dec_fuzzer CORPUS_DIR"
@@ -211,6 +211,8 @@ int main(int argc, char **argv) {
  num_references = (int)strtol(argv[3], NULL, 0);
  info = aom_video_reader_get_info(reader);

+  aom_video_reader_set_fourcc(reader, AV1_FOURCC);
+
  // The writer to write out ivf file in tile list OBU, which can be decoded by
  // AV1 decoder.
  writer = aom_video_writer_open(argv[2], kContainerIVF, info);
@@ -188,8 +188,10 @@ int main(int argc, char **argv) {

  info = aom_video_reader_get_info(reader);

-  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
-  if (!decoder) die("Unknown input codec.");
+  if (info->codec_fourcc == LST_FOURCC)
+    decoder = get_aom_decoder_by_fourcc(AV1_FOURCC);
+  else
+    die("Unknown input codec.");
  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));

  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
@@ -397,6 +397,10 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
  for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]);

  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  // Modify large_scale_file fourcc.
+  if (cfg->large_scale_tile == 1)
+    aom_video_writer_set_fourcc(writer, LST_FOURCC);
  aom_video_writer_close(writer);

  printf("\nSecond pass complete. Processed %d frames.\n", frame_count);
@@ -287,67 +287,6 @@ void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
    }
  }
 }
-void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
-  TxfmParam param;
-  memset(&param, 0, sizeof(param));
-  const int rows = tx_size_high[tx_size];
-  const int cols = tx_size_wide[tx_size];
-  const int num_loops = 1000000 / (rows * cols);
-
-  for (int i = 0; i < 2; ++i) {
-    const int bd = 8;
-    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-      if (libaom_test::IsTxSizeTypeValid(
-              tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
-        continue;
-      }
-
-      FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
-      if (ref_func != NULL) {
-        DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
-        DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
-        DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
-        int input_stride = 64;
-        ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-        for (int r = 0; r < rows; ++r) {
-          for (int c = 0; c < cols; ++c) {
-            input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
-          }
-        }
-
-        param.tx_type = (TX_TYPE)tx_type;
-        param.tx_size = (TX_SIZE)tx_size;
-        param.tx_set_type = EXT_TX_SET_ALL16;
-        param.bd = bd;
-
-        aom_usec_timer ref_timer, test_timer;
-
-        aom_usec_timer_start(&ref_timer);
-        for (int i = 0; i < num_loops; ++i) {
-          ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
-        }
-        aom_usec_timer_mark(&ref_timer);
-        const int elapsed_time_c =
-            static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
-
-        aom_usec_timer_start(&test_timer);
-        for (int i = 0; i < num_loops; ++i) {
-          target_func(input, output, input_stride, &param);
-        }
-        aom_usec_timer_mark(&test_timer);
-        const int elapsed_time_simd =
-            static_cast<int>(aom_usec_timer_elapsed(&test_timer));
-
-        printf(
-            "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
-            "gain=%d \n",
-            tx_size, tx_type, elapsed_time_c, elapsed_time_simd,
-            (elapsed_time_c / elapsed_time_simd));
-      }
-    }
-  }
-}

 typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;

@@ -356,9 +295,7 @@ class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
 TEST_P(AV1FwdTxfm2dTest, match) {
  AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
 }
-TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
-  AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
-}
+
 using ::testing::Combine;
 using ::testing::Values;
 using ::testing::ValuesIn;
@@ -411,6 +411,9 @@ INSTANTIATE_TEST_CASE_P(
                                 TX_16X16, TYPE_B, AOM_BITS_8),
                      make_tuple(&aom_quantize_b_32x32_c,
                                 &aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B,
+                                 AOM_BITS_8),
+                      make_tuple(&aom_quantize_b_64x64_c,
+                                 &aom_quantize_b_64x64_ssse3, TX_64X64, TYPE_B,
                                 AOM_BITS_8)));

 #endif  // HAVE_SSSE3 && ARCH_X86_64
@@ -297,7 +297,7 @@ class ResizeInternalTestLarge : public ResizeTest {

  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
    if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.5);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0);
  }

 #if WRITE_COMPRESSED_STREAM