From d1e08375336085474c80b96c1a55a0cc08b69678 Mon Sep 17 00:00:00 2001 From: gnattu Date: Fri, 20 Dec 2024 12:01:20 +0800 Subject: [PATCH 1/3] avfilter/tonemapx: add dovi to hdr10 support This adds a reshape-only mode for Dolby Vision videos without a compatibility layer. In this mode, only Dolby Vision reshaping will be performed, and the output will still be in SMPTE 2084 transfer. The GPU-based filters already support this mode. This will be useful in the future when we implement HDR transcoding. --- ...0-add-simd-optimized-tonemapx-filter.patch | 2709 ++++++++++++----- 1 file changed, 1914 insertions(+), 795 deletions(-) diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 96798a25c9..10ad3f9ee6 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,2022 @@ +@@ -0,0 +1,2366 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -1684,12 +1684,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_dovi_2_420hdr_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef 
ENABLE_TONEMAPX_NEON_INTRINSICS + uint16_t *rdsty = dsty; @@ -1703,12 +1703,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 6; + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; + int cby = (*params->rgb2yuv_coeffs)[0][2][0]; @@ -1718,10 +1712,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(params->in_yuv_off); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(512); + uint16x4_t ux4, vx4; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; @@ -1742,6 +1732,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_rndx4 = vdupq_n_s32(TEN_BIT_ROUNDING); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(TEN_BIT_UV_OFFSET); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(CHROMA_AVG_ROUNDING); ++ float32x4_t ipt0, ipt1, ipt2, ipt3; ++ float32x4_t ia1, ib1, ia2, ib2; ++ float32x4_t ix4, px4, tx4; ++ float32x4_t lx4, mx4, sx4; ++ float32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ float32x4_t y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { @@ -1752,31 +1748,170 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); + ux4 = vld1_u16(srcu + (x >> 1)); + vx4 = vld1_u16(srcv + (x >> 1)); -+ y0x8 = 
vsubq_u16(y0x8, in_yuv_offx8); -+ y0x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y0x8), vdupq_n_s16(0))); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ y1x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y1x8), vdupq_n_s16(0))); + + ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); + vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+ yuv2rgbx8(&r0x8, &g0x8, &b0x8, y0x8, ux8, vx8, cy, crv, cgu, cgv, cbu); -+ yuv2rgbx8(&r1x8, &g1x8, &b1x8, y1x8, ux8, vx8, cy, crv, cgu, cgv, cbu); ++ y0x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y0x8))); ++ y0x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y0x8))); ++ y1x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y1x8))); ++ y1x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y1x8))); + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ ux4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(ux8))); ++ ux4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(ux8))); ++ vx4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vx8))); ++ vx4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vx8))); + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++ y0x4a = vdivq_f32(y0x4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ y0x4b = vdivq_f32(y0x4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ y1x4a = vdivq_f32(y1x4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ y1x4b = vdivq_f32(y1x4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ ux4a = vdivq_f32(ux4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ ux4b = vdivq_f32(ux4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ 
vx4a = vdivq_f32(vx4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ vx4b = vdivq_f32(vx4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ ++ // Reshape y0x4a ++ ia1 = vzip1q_f32(y0x4a, ux4a); ++ ia2 = vzip2q_f32(y0x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); ++ ++ // Reshape y0x4b ++ ia1 = vzip1q_f32(y0x4b, ux4b); ++ ia2 = vzip2q_f32(y0x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, 
ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); ++ ++ r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ r0x8 = vminq_u16(r0x8, vdupq_n_u16(INT16_MAX)); ++ g0x8 = vminq_u16(g0x8, vdupq_n_u16(INT16_MAX)); ++ b0x8 = vminq_u16(b0x8, vdupq_n_u16(INT16_MAX)); ++ ++ // Reshape y1x4a ++ ia1 = vzip1q_f32(y1x4a, ux4a); ++ ia2 = vzip2q_f32(y1x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, 
ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); ++ ++ // Reshape y1x4b ++ ia1 = vzip1q_f32(y1x4b, ux4b); ++ ia2 = vzip2q_f32(y1x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); ++ ++ r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ r1x8 = vminq_u16(r1x8, vdupq_n_u16(INT16_MAX)); ++ g1x8 = vminq_u16(g1x8, vdupq_n_u16(INT16_MAX)); ++ b1x8 = vminq_u16(b1x8, vdupq_n_u16(INT16_MAX)); ++ ++ r0ox8 = r0x8; ++ g0ox8 = g0x8; ++ b0ox8 = b0x8; + + r0oax4 = 
vmovl_s16(vget_low_s16(r0ox8)); + g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); @@ -1803,9 +1938,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); + vst1q_u16(&dsty[x], y0ox8); + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ r1ox8 = r1x8; ++ g1ox8 = g1x8; ++ b1ox8 = b1x8; + + r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); + g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); @@ -1884,27 +2019,29 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_dovi_2_420hdr(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_p010_2_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -1928,9 +2065,8 @@ 
Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; + uint16x8_t in_yuv_offx8 = vdupq_n_u16(params->in_yuv_off); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(TEN_BIT_UV_OFFSET); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(512); ++ uint16x4_t ux4, vx4; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; @@ -1944,7 +2080,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uint16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; + int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); @@ -1952,31 +2087,240 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_uv_offsetx4 = vdupq_n_s32(TEN_BIT_UV_OFFSET); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(CHROMA_AVG_ROUNDING); + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, TEN_BIT_BIPLANAR_SHIFT); -+ y1x8 = vshrq_n_u16(y1x8, TEN_BIT_BIPLANAR_SHIFT); -+ uvx8 = vshrq_n_u16(uvx8, TEN_BIT_BIPLANAR_SHIFT); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); + y0x8 = vsubq_u16(y0x8, in_yuv_offx8); + y0x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y0x8), 
vdupq_n_s16(0))); + y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + y1x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y1x8), vdupq_n_s16(0))); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); -+ -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); + -+ ux8 = vcombine_u16(ux2a, ux2b); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); ++ ++ yuv2rgbx8(&r0x8, &g0x8, &b0x8, y0x8, ux8, vx8, cy, crv, cgu, cgv, cbu); ++ yuv2rgbx8(&r1x8, &g1x8, &b1x8, y1x8, ux8, vx8, cy, crv, cgu, cgv, cbu); ++ ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ 
y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, TEN_BIT_SCALE_SHIFT); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, TEN_BIT_SCALE_SHIFT); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, TEN_BIT_SCALE_SHIFT); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, TEN_BIT_SCALE_SHIFT); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = 
vshrq_n_s32(ravgx4, CHROMA_AVG_ROUNDING); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, CHROMA_AVG_ROUNDING); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, CHROMA_AVG_ROUNDING); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, TEN_BIT_SCALE_SHIFT); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, TEN_BIT_SCALE_SHIFT); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, 
++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS ++} ++ ++void tonemap_frame_p010_2_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(params->in_yuv_off); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(TEN_BIT_UV_OFFSET); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ 
int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(TEN_BIT_ROUNDING); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(TEN_BIT_UV_OFFSET); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(CHROMA_AVG_ROUNDING); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, TEN_BIT_BIPLANAR_SHIFT); ++ y1x8 = vshrq_n_u16(y1x8, TEN_BIT_BIPLANAR_SHIFT); ++ uvx8 = vshrq_n_u16(uvx8, TEN_BIT_BIPLANAR_SHIFT); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y0x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y0x8), vdupq_n_s16(0))); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ y1x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y1x8), vdupq_n_s16(0))); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); + vx8 = vcombine_u16(vx2a, vx2b); + + yuv2rgbx8(&r0x8, &g0x8, &b0x8, y0x8, ux8, vx8, cy, crv, cgu, cgv, cbu); @@ -2122,7 +2466,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h =================================================================== --- /dev/null +++ 
FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -2176,6 +2520,13 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + int width, int height, + const struct TonemapIntParams *params); + ++void tonemap_frame_dovi_2_420hdr_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, @@ -2271,7 +2622,7 @@ Index: FFmpeg/libavfilter/colorspace.h =================================================================== --- FFmpeg.orig/libavfilter/colorspace.h +++ FFmpeg/libavfilter/colorspace.h -@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); +@@ -109,4 +109,8 @@ float eotf_arib_b67(float x); float inverse_eotf_arib_b67(float x); float inverse_eotf_bt1886(float x); @@ -2284,7 +2635,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1799 @@ +@@ -0,0 +1,1881 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -3354,7 +3705,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++void tonemap_frame_dovi_2_420hdr(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -3362,21 +3713,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; + int cby = (*params->rgb2yuv_coeffs)[0][2][0]; @@ -3391,19 +3732,90 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int r10, g10, b10; + int r11, g11, b11; + ++ const float in_rng = (float)((1 << in_depth) - 1); ++ + int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = 
(srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++ int y00 = (srcy[x] ); ++ int y01 = (srcy[x + 1] ); ++ int y10 = (srcy[srclinesize[0] / 2 + x] ); ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]); ++ int u = (srcu[x >> 1]); ++ int v = (srcv[x >> 1]); + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ dovi2rgb(y00, y01, y10, y11, u, v, params, in_rng, r, g, b); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 10); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 10); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 10); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 10); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 10); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 10); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int 
in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); + r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); + r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); + r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); @@ -3804,8 +4216,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + av_frame_free(&in); + -+ av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); -+ 
av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); ++ if (s->trc !=AVCOL_TRC_SMPTE2084) { ++ av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); ++ av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); ++ } ++ + av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_RPU_BUFFER); + av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_METADATA); + @@ -3896,6 +4311,23 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", + s->format_str); + ++ if (s->trc == AVCOL_TRC_SMPTE2084) { ++ if (s->spc != AVCOL_SPC_BT2020_NCL) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough requires BT2020 Non-constant luminance matrix\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (s->pri != AVCOL_PRI_BT2020) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough requires BT2020 primaries\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (!s->apply_dovi) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough only works for Dolby Vision inputs at the moment\n"); ++ return AVERROR(EINVAL); ++ } ++ } ++ +#if ARCH_AARCH64 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + { @@ -3906,7 +4338,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_neon; -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_neon; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? tonemap_frame_dovi_2_420hdr_neon : tonemap_frame_dovi_2_420p10_neon; + active_simd = SIMD_NEON; + } + } @@ -3923,7 +4355,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_sse; -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_sse; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? 
tonemap_frame_dovi_2_420hdr_sse : tonemap_frame_dovi_2_420p10_sse; + active_simd = SIMD_SSE; + } + } @@ -3939,7 +4371,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_avx; -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_avx; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? tonemap_frame_dovi_2_420hdr_avx : tonemap_frame_dovi_2_420p10_avx; + active_simd = SIMD_AVX; + } + } @@ -3975,7 +4407,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } + + if (!s->tonemap_func_dovi10) { -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? tonemap_frame_dovi_2_420hdr : tonemap_frame_dovi_2_420p10; + } + + switch (active_simd) { @@ -4040,6 +4472,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" }, + { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, .unit = "transfer" }, + { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, .unit = "transfer" }, ++ { "smpte2084",0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE2084}, 0, 0, FLAGS, .unit = "transfer" }, + { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, + { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, + { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, .unit = "matrix" }, @@ -4088,7 +4521,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,144 @@ +/* + * This file is part of 
FFmpeg. + * @@ -4211,6 +4644,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + int width, int height, + const struct TonemapIntParams *params); + ++void tonemap_frame_dovi_2_420hdr(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, @@ -4243,7 +4683,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,2289 @@ +@@ -0,0 +1,2584 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -5331,7 +5771,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V3 void tonemap_frame_dovi_2_420hdr_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -5339,33 +5779,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd + int 
remainw = width & 14; + + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ const float in_rng = (float)((1 << in_depth) - 1); + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; + int cby = (*params->rgb2yuv_coeffs)[0][2][0]; @@ -5375,32 +5807,29 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); -+ + __m256i ux8, vx8; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; + __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m256i r0ox16, g0ox16, b0ox16; + __m256i y0ox16; + __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; + __m256i yoax8, yobx8; + -+ __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; + __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ ++ __m128 ipt0, ipt1, ipt2, ipt3, ipt4, ipt5, ipt6, ipt7; ++ __m256 ix8, px8, tx8; ++ __m256 lx8, mx8, sx8; ++ __m256 rx8a, gx8a, bx8a, rx8b, gx8b, bx8b; ++ __m256 y0x8af, y0x8bf, y1x8af, y1x8bf, ux8af, ux8bf, vx8af, 
vx8bf; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + @@ -5414,117 +5843,128 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); -+ + ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); + ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); + vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ y0x8af = _mm256_cvtepi32_ps(y0x8a); ++ y0x8bf = _mm256_cvtepi32_ps(y0x8b); ++ y1x8af = _mm256_cvtepi32_ps(y1x8a); ++ y1x8bf = _mm256_cvtepi32_ps(y1x8b); ++ ux8af = _mm256_cvtepi32_ps(ux8a); ++ ux8bf = _mm256_cvtepi32_ps(ux8b); ++ vx8af = _mm256_cvtepi32_ps(vx8a); ++ vx8bf = _mm256_cvtepi32_ps(vx8b); + -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ 
r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ y0x8af = _mm256_div_ps(y0x8af, _mm256_set1_ps(in_rng)); ++ y0x8bf = _mm256_div_ps(y0x8bf, _mm256_set1_ps(in_rng)); ++ y1x8af = _mm256_div_ps(y1x8af, _mm256_set1_ps(in_rng)); ++ y1x8bf = _mm256_div_ps(y1x8bf, _mm256_set1_ps(in_rng)); ++ ux8af = _mm256_div_ps(ux8af, _mm256_set1_ps(in_rng)); ++ ux8bf = _mm256_div_ps(ux8bf, _mm256_set1_ps(in_rng)); ++ vx8af = _mm256_div_ps(vx8af, _mm256_set1_ps(in_rng)); ++ vx8bf = _mm256_div_ps(vx8bf, _mm256_set1_ps(in_rng)); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ // Reshape y0x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8af, ux8af, vx8af, params); + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8a = 
_mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(JPEG_SCALE)); ++ ++ r0x8a = _mm256_cvtps_epi32(rx8a); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ g0x8a = _mm256_cvtps_epi32(gx8a); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ b0x8a = _mm256_cvtps_epi32(bx8a); + b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ // Reshape y1x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8af, ux8af, vx8af, params); + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, 
_mm256_set1_ps(JPEG_SCALE)); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ r1x8a = _mm256_cvtps_epi32(rx8a); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ g1x8a = _mm256_cvtps_epi32(gx8a); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ b1x8a = _mm256_cvtps_epi32(bx8a); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ // Reshape y0x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8bf, ux8bf, vx8bf, params); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], -+ params->lin_lut, params->tonemap_lut, 
params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ r0x8b = _mm256_cvtps_epi32(rx8b); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ g0x8b = _mm256_cvtps_epi32(gx8b); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ b0x8b = _mm256_cvtps_epi32(bx8b); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ // Reshape y1x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8bf, ux8bf, vx8bf, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); ++ ++ r1x8b = _mm256_cvtps_epi32(rx8b); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ g1x8b = _mm256_cvtps_epi32(gx8b); ++ g1x8b = 
av_clip_int16_avx(g1x8b); ++ b1x8b = _mm256_cvtps_epi32(bx8b); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ roax8 = r0x8a; ++ goax8 = g0x8a; ++ boax8 = b0x8a; ++ ++ robx8 = r0x8b; ++ gobx8 = g0x8b; ++ bobx8 = b0x8b; + + yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); + yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); @@ -5540,21 +5980,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); -+ -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax8 = r1x8a; ++ g1oax8 = g1x8a; ++ b1oax8 = b1x8a; + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx8 = r1x8b; ++ g1obx8 = g1x8b; ++ b1obx8 = b1x8b; + + y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); @@ -5570,9 +6006,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, 
_mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -5597,20 +6033,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); + uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); + vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ _mm_storeu_si128((__m128i_u 
*) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -5623,7 +6057,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ tonemap_frame_dovi_2_420hdr(rdsty, rdstu, rdstv, + rsrcy, rsrcu, rsrcv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -5632,22 +6066,22 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 14; + @@ -5683,24 +6117,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; + __m256i ux8, vx8; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, 
y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; + __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; + __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + @@ -5840,9 +6275,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); + g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); @@ -5870,9 +6305,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x 
+ dstlinesize[0] / 2], y1ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -5897,18 +6332,20 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); + uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); + vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -5921,29 +6358,31 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, 
rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 14; + @@ -5979,54 +6418,48 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; -+ + __m256i r0ox16, g0ox16, b0ox16; + __m256i y0ox16; + __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; + __m256i yoax8, yobx8; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i 
y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); -+ -+ // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, TEN_BIT_BIPLANAR_SHIFT); -+ y1x16 = _mm256_srli_epi16(y1x16, TEN_BIT_BIPLANAR_SHIFT); -+ uvx16 = _mm256_srli_epi16(uvx16, TEN_BIT_BIPLANAR_SHIFT); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + + y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); + y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ + y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); + y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); + y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); + y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, 
in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); + r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); @@ -6142,9 +6575,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); + g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); @@ -6172,9 +6605,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + 
dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -6199,17 +6632,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -6217,19 +6651,21 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + if (remainw) { + int offset = width & (int)0xfffffff0; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ 
rsrcuv += offset; -+ tonemap_frame_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -6237,12 +6673,12 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcuv = srcuv; + int rheight = height; -+ // not zero when not divisible by 8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 14; + @@ -6255,7 +6691,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -6279,23 +6714,24 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; + __m256i uvx16, uvx8a, uvx8b; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + 
__m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; + __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; + __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; @@ -6433,7 +6869,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); + yoax8 = _mm256_srai_epi32(yoax8, out_sh); + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); -+ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); + + yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); + yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); @@ -6441,11 +6876,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); -+ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); + g1ox16 = _mm256_lddqu_si256((const 
__m256i_u *)g1); @@ -6465,7 +6899,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); + y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); -+ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); + + y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); @@ -6473,11 +6906,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); -+ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -6511,10 +6943,8 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + + uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); + uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); -+ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); -+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + } + } + @@ -6525,7 +6955,7 @@ Index: 
FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ tonemap_frame_p010_2_nv12(rdsty, rdstuv, + rsrcy, rsrcuv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -6533,484 +6963,1197 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -@@ -0,0 +1,68 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -+#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -+ -+#include "libavfilter/vf_tonemapx.h" -+ -+X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); + +X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, 
+ const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, -+ const struct TonemapIntParams *params); ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; + -+#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,2370 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+#include "vf_tonemapx_intrin_sse.h" ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+# include -+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction -+// cast the register into float register and store with movss as a workaround -+#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) -+__attribute__((always_inline)) -+X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { -+ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); -+ return; -+} -+#endif ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) -+{ -+ __m128i mask = _mm_set1_epi32(0x7FFF); -+ __m128i condition = _mm_and_si128(a, 
_mm_set1_epi32(~0x7FFF)); ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m128i zero = _mm_setzero_si128(); -+ __m128i cmp = _mm_cmpeq_epi32(condition, zero); ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); -+ __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ return result; -+} ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); + -+X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) -+{ -+ __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); -+ __m128i mask = _mm_set1_epi32(~0xFFFF); -+ __m128i condition = _mm_and_si128(add_result, mask); -+ __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, TEN_BIT_BIPLANAR_SHIFT); ++ y1x16 = 
_mm256_srli_epi16(y1x16, TEN_BIT_BIPLANAR_SHIFT); ++ uvx16 = _mm256_srli_epi16(uvx16, TEN_BIT_BIPLANAR_SHIFT); + -+ __m128i shifted = _mm_srai_epi32(a, 31); -+ __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); + -+ return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); -+} ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + -+X86_64_V2 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a) -+{ -+ __m128 n = _mm_sub_ps(y, x); -+ n = _mm_mul_ps(n, a); -+ n = _mm_add_ps(n, x); -+ return n; -+} ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+X86_64_V2 inline static float reduce_floatx4(__m128 x) { -+ x = _mm_hadd_ps(x, x); -+ x = _mm_hadd_ps(x, x); -+ return _mm_cvtss_f32(x); -+} ++ r1x8a = g1x8a = b1x8a = 
_mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + -+X86_64_V2 static inline float reshape_poly(float s, __m128 coeffs) -+{ -+ __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); -+ ps = _mm_mul_ps(ps, coeffs); -+ return reduce_floatx4(ps); -+} ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+X86_64_V2 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr, -+ int mmr_single, int min_order, int max_order) -+{ -+ float s = _mm_cvtss_f32(coeffs); -+ int mmr_idx = 0; -+ int order = 0; ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + -+ __m128 mmr_coeffs, ps; -+ __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} -+ __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} -+ __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} -+ __m128 sigX = sigX01; // sig[0]*sig[1] now positioned at 0 ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_add_epi32(b0x8a, 
_mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1 -+ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2 -+ sigX = _mm_insert_ps(sigX, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3 ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ mmr_idx = mmr_single ? 0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); -+ order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ // dot first order -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 0*4]); -+ ps = _mm_mul_ps(sig, mmr_coeffs); -+ s += reduce_floatx4(ps); -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 1*4]); -+ ps = _mm_mul_ps(sigX, mmr_coeffs); -+ s += reduce_floatx4(ps); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { -+ __m128 sig2 = _mm_mul_ps(sig, sig); -+ __m128 sigX2 = _mm_mul_ps(sigX, sigX); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = 
_mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 2*4]); -+ ps = _mm_mul_ps(sig2, mmr_coeffs); -+ s += reduce_floatx4(ps); -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 3*4]); -+ ps = _mm_mul_ps(sigX2, mmr_coeffs); -+ s += reduce_floatx4(ps); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ if (max_order == 3 && (min_order == 3 || order >= 3)) { -+ __m128 sig3 = _mm_mul_ps(sig2, sig); -+ __m128 sigX3 = _mm_mul_ps(sigX2, sigX); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 4*4]); -+ ps = _mm_mul_ps(sig3, mmr_coeffs); -+ s += reduce_floatx4(ps); -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 5*4]); -+ ps = _mm_mul_ps(sigX3, mmr_coeffs); -+ s += reduce_floatx4(ps); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, 
params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = 
_mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, 
_mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); ++ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } + -+ return s; ++ // Process remaining pixels cannot fill the full simd register with scalar version 
++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +@@ -0,0 +1,75 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) -+X86_64_V2 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) -+{ -+ int has_mmr_poly; -+ float s; ++#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H ++#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H ++ ++#include "libavfilter/vf_tonemapx.h" ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_dovi_2_420hdr_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ 
const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +@@ -0,0 +1,2740 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "vf_tonemapx_intrin_sse.h"
++
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++# include <smmintrin.h>
++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
++
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction
++// cast the register into float register and store with movss as a workaround
++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
++__attribute__((always_inline))
++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) {
++    _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a));
++    return;
++}
++#endif
++
++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a)
++{
++    __m128i mask = _mm_set1_epi32(0x7FFF);
++    __m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF));
++
++    __m128i zero = _mm_setzero_si128();
++    __m128i cmp = _mm_cmpeq_epi32(condition, zero);
++
++    __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask);
++    __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a));
++
++    return result;
++}
++
++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a)
++{
++    __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U));
++    __m128i mask = _mm_set1_epi32(~0xFFFF);
++    __m128i condition = _mm_and_si128(add_result, mask);
++    __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128());
++
++    __m128i shifted = _mm_srai_epi32(a, 31);
++    __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF));
++
++    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result));
++}
++
++X86_64_V2 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a)
++{
++    __m128 n = _mm_sub_ps(y, x);
++    n = _mm_mul_ps(n, a);
++    n = _mm_add_ps(n, x);
++    return 
n; ++} ++ ++X86_64_V2 inline static float reduce_floatx4(__m128 x) { ++ x = _mm_hadd_ps(x, x); ++ x = _mm_hadd_ps(x, x); ++ return _mm_cvtss_f32(x); ++} ++ ++X86_64_V2 static inline float reshape_poly(float s, __m128 coeffs) ++{ ++ __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); ++ ps = _mm_mul_ps(ps, coeffs); ++ return reduce_floatx4(ps); ++} ++ ++X86_64_V2 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr, ++ int mmr_single, int min_order, int max_order) ++{ ++ float s = _mm_cvtss_f32(coeffs); ++ int mmr_idx = 0; ++ int order = 0; ++ ++ __m128 mmr_coeffs, ps; ++ __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} ++ __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} ++ __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} ++ __m128 sigX = sigX01; // sig[0]*sig[1] now positioned at 0 ++ ++ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1 ++ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2 ++ sigX = _mm_insert_ps(sigX, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3 ++ ++ mmr_idx = mmr_single ? 
0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); ++ order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); ++ ++ // dot first order ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 0*4]); ++ ps = _mm_mul_ps(sig, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 1*4]); ++ ps = _mm_mul_ps(sigX, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ ++ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { ++ __m128 sig2 = _mm_mul_ps(sig, sig); ++ __m128 sigX2 = _mm_mul_ps(sigX, sigX); ++ ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 2*4]); ++ ps = _mm_mul_ps(sig2, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 3*4]); ++ ps = _mm_mul_ps(sigX2, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ ++ if (max_order == 3 && (min_order == 3 || order >= 3)) { ++ __m128 sig3 = _mm_mul_ps(sig2, sig); ++ __m128 sigX3 = _mm_mul_ps(sigX2, sigX); ++ ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 4*4]); ++ ps = _mm_mul_ps(sig3, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 5*4]); ++ ps = _mm_mul_ps(sigX3, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ } ++ } ++ ++ return s; ++} ++ ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++X86_64_V2 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) ++{ ++ int has_mmr_poly; ++ float s; + + float *src_dovi_params = ctx->dovi_pbuf; + float *src_dovi_pivots = ctx->dovi_pbuf + 24; + float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4* + float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4* + -+ float* dovi_params_i = src_dovi_params + 0*8; -+ float* dovi_pivots_i = src_dovi_pivots + 0*8; -+ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* -+ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* -+ int dovi_num_pivots_i = dovi_params_i[0]; -+ int dovi_has_mmr_i = dovi_params_i[1]; -+ int dovi_has_poly_i = dovi_params_i[2]; -+ int dovi_mmr_single_i = 
dovi_params_i[3]; -+ int dovi_min_order_i = dovi_params_i[4]; -+ int dovi_max_order_i = dovi_params_i[5]; -+ float dovi_lo_i = dovi_params_i[6]; -+ float dovi_hi_i = dovi_params_i[7]; ++ float* dovi_params_i = src_dovi_params + 0*8; ++ float* dovi_pivots_i = src_dovi_pivots + 0*8; ++ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* ++ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* ++ int dovi_num_pivots_i = dovi_params_i[0]; ++ int dovi_has_mmr_i = dovi_params_i[1]; ++ int dovi_has_poly_i = dovi_params_i[2]; ++ int dovi_mmr_single_i = dovi_params_i[3]; ++ int dovi_min_order_i = dovi_params_i[4]; ++ int dovi_max_order_i = dovi_params_i[5]; ++ float dovi_lo_i = dovi_params_i[6]; ++ float dovi_hi_i = dovi_params_i[7]; ++ ++ float* dovi_params_p = src_dovi_params + 1*8; ++ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* ++ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* ++ int dovi_has_mmr_p = dovi_params_p[1]; ++ int dovi_has_poly_p = dovi_params_p[2]; ++ int dovi_mmr_single_p = dovi_params_p[3]; ++ int dovi_min_order_p = dovi_params_p[4]; ++ int dovi_max_order_p = dovi_params_p[5]; ++ float dovi_lo_p = dovi_params_p[6]; ++ float dovi_hi_p = dovi_params_p[7]; ++ ++ float* dovi_params_t = src_dovi_params + 2*8; ++ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* ++ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* ++ int dovi_has_mmr_t = dovi_params_t[1]; ++ int dovi_has_poly_t = dovi_params_t[2]; ++ int dovi_mmr_single_t = dovi_params_t[3]; ++ int dovi_min_order_t = dovi_params_t[4]; ++ int dovi_max_order_t = dovi_params_t[5]; ++ float dovi_lo_t = dovi_params_t[6]; ++ float dovi_hi_t = dovi_params_t[7]; ++ ++ __m128 coeffs, result; ++ ++ // reshape I ++ s = _mm_cvtss_f32(sig); ++ result = sig; ++ if (dovi_num_pivots_i > 2) { ++ __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); ++ __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 
2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2])); ++ __m128 m0123 = mix_float32x4(m01, m23, _mm_set1_ps(s >= dovi_pivots_i[1])); ++ __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4])); ++ __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6])); ++ __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5])); ++ coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); ++ } else { ++ coeffs = _mm_loadu_ps(dovi_coeffs_i); ++ } ++ ++ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_i, ++ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); ++ ++ // reshape P ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_p); ++ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_p, ++ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); ++ ++ // reshape T ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_t); ++ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || 
(!has_mmr_poly && dovi_has_poly_t)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_t, ++ dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); ++ ++ return result; ++} ++ ++X86_64_V2 inline static void ycc2rgbx4(__m128* dy, __m128* dcb, __m128* dcr, ++ __m128 y, __m128 cb, __m128 cr, ++ const double nonlinear[3][3], const float ycc_offset[3]) ++{ ++ *dy = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[0][0])); ++ *dy = _mm_add_ps(*dy, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[0][1]))); ++ *dy = _mm_add_ps(*dy, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[0][2]))); ++ *dy = _mm_sub_ps(*dy, _mm_set1_ps(ycc_offset[0])); ++ ++ *dcb = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[1][0])); ++ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[1][1]))); ++ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[1][2]))); ++ *dcb = _mm_sub_ps(*dcb, _mm_set1_ps(ycc_offset[1])); ++ ++ *dcr = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[2][0])); ++ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[2][1]))); ++ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[2][2]))); ++ *dcr = _mm_sub_ps(*dcr, _mm_set1_ps(ycc_offset[2])); ++} ++ ++X86_64_V2 inline static void lms2rgbx4(__m128* dl, __m128* dm, __m128* ds, ++ __m128 l, __m128 m, __m128 s, ++ const double lms2rgb_matrix[3][3]) ++{ ++ *dl = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[0][0])); ++ *dl = _mm_add_ps(*dl, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[0][1]))); ++ *dl = _mm_add_ps(*dl, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[0][2]))); ++ ++ *dm = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[1][0])); ++ *dm = _mm_add_ps(*dm, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[1][1]))); ++ *dm = _mm_add_ps(*dm, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[1][2]))); ++ ++ *ds = _mm_mul_ps(l, 
_mm_set1_ps((float)lms2rgb_matrix[2][0])); ++ *ds = _mm_add_ps(*ds, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[2][1]))); ++ *ds = _mm_add_ps(*ds, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[2][2]))); ++} ++ ++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m128i sig4; ++ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; ++ __m128 offset = _mm_set1_ps(0.5f); ++ __m128 intermediate_upper_bound = _mm_set1_ps(JPEG_SCALE); ++ __m128i r, g, b, rx4, gx4, bx4; ++ ++ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ ++ r = av_clip_uint16_sse(r_in); ++ g = av_clip_uint16_sse(g_in); ++ b = av_clip_uint16_sse(b_in); ++ ++ sig4 = _mm_max_epi32(r, _mm_max_epi32(g, b)); ++ ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ ++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ ++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ ++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++ ++#undef LOAD_LUT ++ ++ mapvalx4 = _mm_loadu_ps(mapval4); ++ r_linx4 = _mm_loadu_ps(r_lin4); ++ g_linx4 = _mm_loadu_ps(g_lin4); ++ b_linx4 = _mm_loadu_ps(b_lin4); ++ ++ if (!rgb2rgb_passthrough) { ++ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ ++ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); ++ g_linx4 = 
_mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++ } ++ ++ if (desat > 0) { ++ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); ++ __m128 desat4 = _mm_set1_ps((float)desat); ++ __m128 luma4 = _mm_set1_ps(0); ++ __m128 overbright4; ++ ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); ++ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); ++ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); ++ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); ++ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); ++ } ++ ++ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); ++ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); ++ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ ++ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); ++ r_linx4 = _mm_add_ps(r_linx4, offset); ++ ++ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); ++ g_linx4 = _mm_add_ps(g_linx4, offset); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); ++ b_linx4 = _mm_add_ps(b_linx4, offset); ++ ++ rx4 = _mm_cvttps_epi32(r_linx4); ++ rx4 = av_clip_uint16_sse(rx4); ++ gx4 = _mm_cvttps_epi32(g_linx4); ++ gx4 = av_clip_uint16_sse(gx4); ++ bx4 = _mm_cvttps_epi32(b_linx4); ++ bx4 = av_clip_uint16_sse(bx4); ++ ++#define SAVE_COLOR(i) r_out[i] 
= delin_lut[_mm_extract_epi32(rx4, i)]; \ ++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ ++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) ++ ++#undef SAVE_COLOR ++} ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++ ++X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const float in_rng = (float)((1 << in_depth) - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ __m128i r0ox8, g0ox8, b0ox8; 
++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; ++ ++ __m128 ipt0, ipt1, ipt2, ipt3; ++ __m128 ia1, ib1, ia2, ib2; ++ __m128 ix4, px4, tx4; ++ __m128 lx4, mx4, sx4; ++ __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); ++ ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); ++ ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); ++ ++ y0x4af = _mm_cvtepi32_ps(y0x4a); ++ y0x4bf = _mm_cvtepi32_ps(y0x4b); ++ y1x4af = _mm_cvtepi32_ps(y1x4a); ++ y1x4bf = _mm_cvtepi32_ps(y1x4b); ++ ux4af = _mm_cvtepi32_ps(ux4a); ++ ux4bf = _mm_cvtepi32_ps(ux4b); ++ vx4af = _mm_cvtepi32_ps(vx4a); ++ vx4bf = _mm_cvtepi32_ps(vx4b); ++ ++ y0x4af = _mm_div_ps(y0x4af, _mm_set1_ps(in_rng)); ++ y0x4bf = _mm_div_ps(y0x4bf, _mm_set1_ps(in_rng)); ++ y1x4af = _mm_div_ps(y1x4af, _mm_set1_ps(in_rng)); ++ y1x4bf = _mm_div_ps(y1x4bf, _mm_set1_ps(in_rng)); ++ ux4af = _mm_div_ps(ux4af, _mm_set1_ps(in_rng)); ++ ux4bf = _mm_div_ps(ux4bf, 
_mm_set1_ps(in_rng)); ++ vx4af = _mm_div_ps(vx4af, _mm_set1_ps(in_rng)); ++ vx4bf = _mm_div_ps(vx4bf, _mm_set1_ps(in_rng)); ++ ++ // Reshape y0x4a ++ ia1 = _mm_unpacklo_ps(y0x4af, ux4af); ++ ia2 = _mm_unpackhi_ps(y0x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); ++ ++ r0x4a = _mm_cvtps_epi32(rx4a); ++ r0x4a = av_clip_int16_sse(r0x4a); ++ g0x4a = _mm_cvtps_epi32(gx4a); ++ g0x4a = av_clip_int16_sse(g0x4a); ++ b0x4a = _mm_cvtps_epi32(bx4a); ++ b0x4a = av_clip_int16_sse(b0x4a); ++ ++ // Reshape y1x4a ++ ia1 = _mm_unpacklo_ps(y1x4af, ux4af); ++ ia2 = 
_mm_unpackhi_ps(y1x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ float* dovi_params_p = src_dovi_params + 1*8; -+ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* -+ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* -+ int dovi_has_mmr_p = dovi_params_p[1]; -+ int dovi_has_poly_p = dovi_params_p[2]; -+ int dovi_mmr_single_p = dovi_params_p[3]; -+ int dovi_min_order_p = dovi_params_p[4]; -+ int dovi_max_order_p = dovi_params_p[5]; -+ float dovi_lo_p = dovi_params_p[6]; -+ float dovi_hi_p = dovi_params_p[7]; ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); + -+ float* dovi_params_t = src_dovi_params + 2*8; -+ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* -+ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* -+ int dovi_has_mmr_t = dovi_params_t[1]; -+ int dovi_has_poly_t = dovi_params_t[2]; -+ int dovi_mmr_single_t = dovi_params_t[3]; -+ int dovi_min_order_t = dovi_params_t[4]; -+ int dovi_max_order_t = dovi_params_t[5]; -+ float dovi_lo_t = dovi_params_t[6]; -+ float dovi_hi_t = dovi_params_t[7]; ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); + -+ __m128 coeffs, result; ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); + -+ // reshape I -+ s = _mm_cvtss_f32(sig); -+ result = sig; -+ if 
(dovi_num_pivots_i > 2) { -+ __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); -+ __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2])); -+ __m128 m0123 = mix_float32x4(m01, m23, _mm_set1_ps(s >= dovi_pivots_i[1])); -+ __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4])); -+ __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6])); -+ __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5])); -+ coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); -+ } else { -+ coeffs = _mm_loadu_ps(dovi_coeffs_i); -+ } ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) -+ s = reshape_poly(s, coeffs); -+ else -+ s = reshape_mmr(result, coeffs, dovi_mmr_i, -+ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); + -+ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); ++ r1x4a = _mm_cvtps_epi32(rx4a); ++ r1x4a = av_clip_int16_sse(r1x4a); ++ g1x4a = _mm_cvtps_epi32(gx4a); ++ g1x4a = av_clip_int16_sse(g1x4a); ++ b1x4a = _mm_cvtps_epi32(bx4a); 
++ b1x4a = av_clip_int16_sse(b1x4a); + -+ // reshape P -+ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); -+ coeffs = _mm_loadu_ps(dovi_coeffs_p); -+ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; ++ // Reshape y0x4b ++ ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y0x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) -+ s = reshape_poly(s, coeffs); -+ else -+ s = reshape_mmr(result, coeffs, dovi_mmr_p, -+ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); + -+ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); + -+ // reshape T -+ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); -+ coeffs = _mm_loadu_ps(dovi_coeffs_t); -+ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); + -+ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t)) -+ s = 
reshape_poly(s, coeffs); -+ else -+ s = reshape_mmr(result, coeffs, dovi_mmr_t, -+ dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ return result; -+} ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + -+X86_64_V2 inline static void ycc2rgbx4(__m128* dy, __m128* dcb, __m128* dcr, -+ __m128 y, __m128 cb, __m128 cr, -+ const double nonlinear[3][3], const float ycc_offset[3]) -+{ -+ *dy = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[0][0])); -+ *dy = _mm_add_ps(*dy, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[0][1]))); -+ *dy = _mm_add_ps(*dy, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[0][2]))); -+ *dy = _mm_sub_ps(*dy, _mm_set1_ps(ycc_offset[0])); ++ r0x4b = _mm_cvtps_epi32(rx4b); ++ r0x4b = av_clip_int16_sse(r0x4b); ++ g0x4b = _mm_cvtps_epi32(gx4b); ++ g0x4b = av_clip_int16_sse(g0x4b); ++ b0x4b = _mm_cvtps_epi32(bx4b); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ *dcb = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[1][0])); -+ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[1][1]))); -+ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[1][2]))); -+ *dcb = _mm_sub_ps(*dcb, _mm_set1_ps(ycc_offset[1])); ++ // Reshape y1x4b ++ ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y1x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ 
ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ *dcr = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[2][0])); -+ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[2][1]))); -+ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[2][2]))); -+ *dcr = _mm_sub_ps(*dcr, _mm_set1_ps(ycc_offset[2])); -+} ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); + -+X86_64_V2 inline static void lms2rgbx4(__m128* dl, __m128* dm, __m128* ds, -+ __m128 l, __m128 m, __m128 s, -+ const double lms2rgb_matrix[3][3]) -+{ -+ *dl = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[0][0])); -+ *dl = _mm_add_ps(*dl, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[0][1]))); -+ *dl = _mm_add_ps(*dl, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[0][2]))); ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); + -+ *dm = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[1][0])); -+ *dm = _mm_add_ps(*dm, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[1][1]))); -+ *dm = _mm_add_ps(*dm, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[1][2]))); ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); + -+ *ds = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[2][0])); -+ *ds = _mm_add_ps(*ds, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[2][1]))); -+ *ds = _mm_add_ps(*ds, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[2][2]))); -+} ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = 
_mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); + -+X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ __m128i sig4; -+ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; -+ __m128 offset = _mm_set1_ps(0.5f); -+ __m128 intermediate_upper_bound = _mm_set1_ps(JPEG_SCALE); -+ __m128i r, g, b, rx4, gx4, bx4; ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + -+ r = av_clip_uint16_sse(r_in); -+ g = av_clip_uint16_sse(g_in); -+ b = av_clip_uint16_sse(b_in); ++ r1x4b = _mm_cvtps_epi32(rx4b); ++ r1x4b = av_clip_int16_sse(r1x4b); ++ g1x4b = _mm_cvtps_epi32(gx4b); ++ g1x4b = av_clip_int16_sse(g1x4b); ++ b1x4b = _mm_cvtps_epi32(bx4b); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ sig4 = _mm_max_epi32(r, _mm_max_epi32(g, b)); ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ 
params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ -+r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ -+g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ -+b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+#undef LOAD_LUT ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ mapvalx4 = _mm_loadu_ps(mapval4); -+ r_linx4 = _mm_loadu_ps(r_lin4); -+ g_linx4 = _mm_loadu_ps(g_lin4); -+ b_linx4 = _mm_loadu_ps(b_lin4); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ if (!rgb2rgb_passthrough) { -+ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ 
yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + -+ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); -+ } ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ if (desat > 0) { -+ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); -+ __m128 desat4 = _mm_set1_ps((float)desat); -+ __m128 luma4 = _mm_set1_ps(0); -+ __m128 overbright4; ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); -+ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); -+ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); -+ g_linx4 = _mm_sub_ps(g_linx4, 
_mm_mul_ps(g_linx4, overbright4)); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); -+ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); -+ } ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); -+ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); -+ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); -+ r_linx4 = _mm_add_ps(r_linx4, offset); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); -+ g_linx4 = _mm_add_ps(g_linx4, offset); ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + -+ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); -+ b_linx4 = _mm_add_ps(b_linx4, offset); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ rx4 = _mm_cvttps_epi32(r_linx4); -+ rx4 = 
av_clip_uint16_sse(rx4); -+ gx4 = _mm_cvttps_epi32(g_linx4); -+ gx4 = av_clip_uint16_sse(gx4); -+ bx4 = _mm_cvttps_epi32(b_linx4); -+ bx4 = av_clip_uint16_sse(bx4); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ -+g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ -+b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ SAVE_COLOR(0) -+ SAVE_COLOR(1) -+ SAVE_COLOR(2) -+ SAVE_COLOR(3) ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); + -+#undef SAVE_COLOR -+} ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += 
offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++} + -+X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; -+ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; -+ + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -7061,8 +8204,8 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; + __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + @@ -7306,19 +8449,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c 
+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ // output shift bits for 8bit outputs is 29 - 8 = 21 -+ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + + yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packs_epi32(yoax4, yobx4); -+ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + + r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); + g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); @@ -7336,18 +8478,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + + y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); + y1obx4 = _mm_add_epi32(y1obx4, 
_mm_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); -+ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + + ravgx4 = _mm_hadd_epi32(roax4, robx4); + ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); @@ -7367,16 +8509,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); + uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); + uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_srai_epi32(uox4, out_sh); + uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); ++ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); + + vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); + vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); + vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_srai_epi32(vox4, out_sh); + vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); ++ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); + } + } + @@ -7389,16 +8531,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ 
dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++X86_64_V2 void tonemap_frame_dovi_2_420hdr_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -7434,9 +8576,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ + __m128i zero128 = _mm_setzero_si128(); + __m128i ux4, vx4; + __m128i y0x8, y1x8; @@ -7444,12 +8583,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; + __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m128i r0ox8, g0ox8, b0ox8; + __m128i y0ox8; + __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; + __m128i yoax4, yobx4; + -+ __m128i r1ox8, g1ox8, b1ox8; + __m128i y1ox8; + __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + __m128i y1oax4, y1obx4; @@ -7674,34 +8811,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + b1x4b = _mm_cvtps_epi32(bx4b); + b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, 
params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); -+ -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ roax4 = r0x4a; ++ goax4 = g0x4a; ++ boax4 = b0x4a; + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ robx4 = r0x4b; ++ gobx4 = g0x4b; ++ bobx4 = b0x4b; + + yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); @@ -7720,17 +8836,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y0ox8 = _mm_packus_epi32(yoax4, yobx4); + _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); -+ -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ r1oax4 = r1x4a; ++ g1oax4 = g1x4a; ++ b1oax4 = b1x4a; + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ r1obx4 = r1x4b; ++ g1obx4 = g1x4b; ++ b1obx4 = b1x4b; + + y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); @@ -7789,7 +8901,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_dovi_2_420p10(rdsty, rdstu, 
rdstv, ++ tonemap_frame_dovi_2_420hdr(rdsty, rdstu, rdstv, + rsrcy, rsrcu, rsrcv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -8985,7 +10097,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -9025,6 +10137,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + int width, int height, + const struct TonemapIntParams *params); + ++X86_64_V2 void tonemap_frame_dovi_2_420hdr_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, From 9ccb771f7656a3db927c23ae06d3e6179799bd7e Mon Sep 17 00:00:00 2001 From: gnattu Date: Fri, 20 Dec 2024 12:53:42 +0800 Subject: [PATCH 2/3] avfilter/tonemapx: fix register type casting for neon --- .../0060-add-simd-optimized-tonemapx-filter.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 10ad3f9ee6..1637b77bf2 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -1909,9 +1909,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + g1x8 = vminq_u16(g1x8, vdupq_n_u16(INT16_MAX)); + b1x8 = vminq_u16(b1x8, vdupq_n_u16(INT16_MAX)); + -+ r0ox8 = r0x8; -+ g0ox8 = g0x8; -+ b0ox8 = b0x8; ++ r0ox8 = vreinterpretq_s16_u16(r0x8); 
++ g0ox8 = vreinterpretq_s16_u16(g0x8); ++ b0ox8 = vreinterpretq_s16_u16(b0x8); + + r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); + g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); @@ -1938,9 +1938,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); + vst1q_u16(&dsty[x], y0ox8); + -+ r1ox8 = r1x8; -+ g1ox8 = g1x8; -+ b1ox8 = b1x8; ++ r1ox8 = vreinterpretq_s16_u16(r1x8); ++ g1ox8 = vreinterpretq_s16_u16(g1x8); ++ b1ox8 = vreinterpretq_s16_u16(b1x8); + + r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); + g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); From 71adc796a8bc3fc4f76a5f4f4cfe3f7e5f89158b Mon Sep 17 00:00:00 2001 From: gnattu Date: Fri, 20 Dec 2024 22:42:51 +0800 Subject: [PATCH 3/3] avfilter/tonemapx: require 10bit output for hdr --- .../patches/0060-add-simd-optimized-tonemapx-filter.patch | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 1637b77bf2..7fb24da65c 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -2635,7 +2635,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1881 @@ +@@ -0,0 +1,1886 @@ +/* + * This file is part of FFmpeg. + * @@ -4081,6 +4081,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return AVERROR_BUG; + } + ++ if (s->trc == AVCOL_TRC_SMPTE2084 && odesc->comp[0].depth == 8) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough requires 10 bit output format depth\n"); ++ av_assert0(0); ++ } ++ + switch (odesc->comp[2].plane) { + case 1: // biplanar + if (odesc->comp[0].depth == 8) {