From d1e08375336085474c80b96c1a55a0cc08b69678 Mon Sep 17 00:00:00 2001 From: gnattu Date: Fri, 20 Dec 2024 12:01:20 +0800 Subject: [PATCH 1/3] avfilter/tonemapx: add dovi to hdr10 support This adds a reshape-only mode for Dolby Vision videos without a compatibility layer. In this mode, only Dolby Vision reshaping will be performed, and the output will still be in SMPTE 2084 transfer. The GPU-based filters already support this mode. This will be useful in the future when we implement HDR transcoding. --- ...0-add-simd-optimized-tonemapx-filter.patch | 2709 ++++++++++++----- 1 file changed, 1914 insertions(+), 795 deletions(-) diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 96798a25c9..10ad3f9ee6 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,2022 @@ +@@ -0,0 +1,2366 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -1684,12 +1684,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_dovi_2_420hdr_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef 
ENABLE_TONEMAPX_NEON_INTRINSICS + uint16_t *rdsty = dsty; @@ -1703,12 +1703,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 6; + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; + int cby = (*params->rgb2yuv_coeffs)[0][2][0]; @@ -1718,10 +1712,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(params->in_yuv_off); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(512); + uint16x4_t ux4, vx4; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; @@ -1742,6 +1732,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_rndx4 = vdupq_n_s32(TEN_BIT_ROUNDING); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(TEN_BIT_UV_OFFSET); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(CHROMA_AVG_ROUNDING); ++ float32x4_t ipt0, ipt1, ipt2, ipt3; ++ float32x4_t ia1, ib1, ia2, ib2; ++ float32x4_t ix4, px4, tx4; ++ float32x4_t lx4, mx4, sx4; ++ float32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ float32x4_t y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { @@ -1752,31 +1748,170 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); + ux4 = vld1_u16(srcu + (x >> 1)); + vx4 = vld1_u16(srcv + (x >> 1)); -+ y0x8 = 
vsubq_u16(y0x8, in_yuv_offx8); -+ y0x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y0x8), vdupq_n_s16(0))); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ y1x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y1x8), vdupq_n_s16(0))); + + ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); + vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+ yuv2rgbx8(&r0x8, &g0x8, &b0x8, y0x8, ux8, vx8, cy, crv, cgu, cgv, cbu); -+ yuv2rgbx8(&r1x8, &g1x8, &b1x8, y1x8, ux8, vx8, cy, crv, cgu, cgv, cbu); ++ y0x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y0x8))); ++ y0x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y0x8))); ++ y1x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y1x8))); ++ y1x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y1x8))); + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ ux4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(ux8))); ++ ux4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(ux8))); ++ vx4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vx8))); ++ vx4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vx8))); + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++ y0x4a = vdivq_f32(y0x4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ y0x4b = vdivq_f32(y0x4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ y1x4a = vdivq_f32(y1x4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ y1x4b = vdivq_f32(y1x4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ ux4a = vdivq_f32(ux4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ ux4b = vdivq_f32(ux4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ 
vx4a = vdivq_f32(vx4a, vdupq_n_f32(TEN_BIT_SCALE)); ++ vx4b = vdivq_f32(vx4b, vdupq_n_f32(TEN_BIT_SCALE)); ++ ++ // Reshape y0x4a ++ ia1 = vzip1q_f32(y0x4a, ux4a); ++ ia2 = vzip2q_f32(y0x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); ++ ++ // Reshape y0x4b ++ ia1 = vzip1q_f32(y0x4b, ux4b); ++ ia2 = vzip2q_f32(y0x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, 
ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); ++ ++ r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ r0x8 = vminq_u16(r0x8, vdupq_n_u16(INT16_MAX)); ++ g0x8 = vminq_u16(g0x8, vdupq_n_u16(INT16_MAX)); ++ b0x8 = vminq_u16(b0x8, vdupq_n_u16(INT16_MAX)); ++ ++ // Reshape y1x4a ++ ia1 = vzip1q_f32(y1x4a, ux4a); ++ ia2 = vzip2q_f32(y1x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, 
ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); ++ ++ // Reshape y1x4b ++ ia1 = vzip1q_f32(y1x4b, ux4b); ++ ia2 = vzip2q_f32(y1x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); ++ ++ r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ r1x8 = vminq_u16(r1x8, vdupq_n_u16(INT16_MAX)); ++ g1x8 = vminq_u16(g1x8, vdupq_n_u16(INT16_MAX)); ++ b1x8 = vminq_u16(b1x8, vdupq_n_u16(INT16_MAX)); ++ ++ r0ox8 = r0x8; ++ g0ox8 = g0x8; ++ b0ox8 = b0x8; + + r0oax4 = 
vmovl_s16(vget_low_s16(r0ox8)); + g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); @@ -1803,9 +1938,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); + vst1q_u16(&dsty[x], y0ox8); + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ r1ox8 = r1x8; ++ g1ox8 = g1x8; ++ b1ox8 = b1x8; + + r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); + g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); @@ -1884,27 +2019,29 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_dovi_2_420hdr(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_p010_2_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -1928,9 +2065,8 @@ 
Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; + uint16x8_t in_yuv_offx8 = vdupq_n_u16(params->in_yuv_off); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(TEN_BIT_UV_OFFSET); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(512); ++ uint16x4_t ux4, vx4; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; @@ -1944,7 +2080,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uint16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; + int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); @@ -1952,31 +2087,240 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_uv_offsetx4 = vdupq_n_s32(TEN_BIT_UV_OFFSET); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(CHROMA_AVG_ROUNDING); + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, TEN_BIT_BIPLANAR_SHIFT); -+ y1x8 = vshrq_n_u16(y1x8, TEN_BIT_BIPLANAR_SHIFT); -+ uvx8 = vshrq_n_u16(uvx8, TEN_BIT_BIPLANAR_SHIFT); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); + y0x8 = vsubq_u16(y0x8, in_yuv_offx8); + y0x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y0x8), 
vdupq_n_s16(0))); + y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + y1x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y1x8), vdupq_n_s16(0))); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); -+ -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); + -+ ux8 = vcombine_u16(ux2a, ux2b); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); ++ ++ yuv2rgbx8(&r0x8, &g0x8, &b0x8, y0x8, ux8, vx8, cy, crv, cgu, cgv, cbu); ++ yuv2rgbx8(&r1x8, &g1x8, &b1x8, y1x8, ux8, vx8, cy, crv, cgu, cgv, cbu); ++ ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ 
y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, TEN_BIT_SCALE_SHIFT); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, TEN_BIT_SCALE_SHIFT); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, TEN_BIT_SCALE_SHIFT); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, TEN_BIT_SCALE_SHIFT); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = 
vshrq_n_s32(ravgx4, CHROMA_AVG_ROUNDING); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, CHROMA_AVG_ROUNDING); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, CHROMA_AVG_ROUNDING); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, TEN_BIT_SCALE_SHIFT); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, TEN_BIT_SCALE_SHIFT); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, 
++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS ++} ++ ++void tonemap_frame_p010_2_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(params->in_yuv_off); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(TEN_BIT_UV_OFFSET); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ 
int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(TEN_BIT_ROUNDING); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(TEN_BIT_UV_OFFSET); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(CHROMA_AVG_ROUNDING); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, TEN_BIT_BIPLANAR_SHIFT); ++ y1x8 = vshrq_n_u16(y1x8, TEN_BIT_BIPLANAR_SHIFT); ++ uvx8 = vshrq_n_u16(uvx8, TEN_BIT_BIPLANAR_SHIFT); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y0x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y0x8), vdupq_n_s16(0))); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ y1x8 = vreinterpretq_u16_s16(vmaxq_s16(vreinterpretq_s16_u16(y1x8), vdupq_n_s16(0))); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); + vx8 = vcombine_u16(vx2a, vx2b); + + yuv2rgbx8(&r0x8, &g0x8, &b0x8, y0x8, ux8, vx8, cy, crv, cgu, cgv, cbu); @@ -2122,7 +2466,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h =================================================================== --- /dev/null +++ 
FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -2176,6 +2520,13 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + int width, int height, + const struct TonemapIntParams *params); + ++void tonemap_frame_dovi_2_420hdr_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, @@ -2271,7 +2622,7 @@ Index: FFmpeg/libavfilter/colorspace.h =================================================================== --- FFmpeg.orig/libavfilter/colorspace.h +++ FFmpeg/libavfilter/colorspace.h -@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); +@@ -109,4 +109,8 @@ float eotf_arib_b67(float x); float inverse_eotf_arib_b67(float x); float inverse_eotf_bt1886(float x); @@ -2284,7 +2635,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1799 @@ +@@ -0,0 +1,1881 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -3354,7 +3705,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++void tonemap_frame_dovi_2_420hdr(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -3362,21 +3713,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; + int cby = (*params->rgb2yuv_coeffs)[0][2][0]; @@ -3391,19 +3732,90 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int r10, g10, b10; + int r11, g11, b11; + ++ const float in_rng = (float)((1 << in_depth) - 1); ++ + int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = 
(srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++ int y00 = (srcy[x] ); ++ int y01 = (srcy[x + 1] ); ++ int y10 = (srcy[srclinesize[0] / 2 + x] ); ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]); ++ int u = (srcu[x >> 1]); ++ int v = (srcv[x >> 1]); + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ dovi2rgb(y00, y01, y10, y11, u, v, params, in_rng, r, g, b); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 10); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 10); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 10); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 10); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 10); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 10); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int 
in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); + r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); + r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); + r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); @@ -3804,8 +4216,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + av_frame_free(&in); + -+ av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); -+ 
av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); ++ if (s->trc !=AVCOL_TRC_SMPTE2084) { ++ av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); ++ av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); ++ } ++ + av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_RPU_BUFFER); + av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_METADATA); + @@ -3896,6 +4311,23 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", + s->format_str); + ++ if (s->trc == AVCOL_TRC_SMPTE2084) { ++ if (s->spc != AVCOL_SPC_BT2020_NCL) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough requires BT2020 Non-constant luminance matrix\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (s->pri != AVCOL_PRI_BT2020) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough requires BT2020 primaries\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (!s->apply_dovi) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough only works for Dolby Vision inputs at the moment\n"); ++ return AVERROR(EINVAL); ++ } ++ } ++ +#if ARCH_AARCH64 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + { @@ -3906,7 +4338,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_neon; -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_neon; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? tonemap_frame_dovi_2_420hdr_neon : tonemap_frame_dovi_2_420p10_neon; + active_simd = SIMD_NEON; + } + } @@ -3923,7 +4355,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_sse; -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_sse; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? 
tonemap_frame_dovi_2_420hdr_sse : tonemap_frame_dovi_2_420p10_sse; + active_simd = SIMD_SSE; + } + } @@ -3939,7 +4371,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_avx; -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_avx; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? tonemap_frame_dovi_2_420hdr_avx : tonemap_frame_dovi_2_420p10_avx; + active_simd = SIMD_AVX; + } + } @@ -3975,7 +4407,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } + + if (!s->tonemap_func_dovi10) { -+ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10; ++ s->tonemap_func_dovi10 = s->trc == AVCOL_TRC_SMPTE2084 ? tonemap_frame_dovi_2_420hdr : tonemap_frame_dovi_2_420p10; + } + + switch (active_simd) { @@ -4040,6 +4472,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" }, + { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, .unit = "transfer" }, + { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, .unit = "transfer" }, ++ { "smpte2084",0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE2084}, 0, 0, FLAGS, .unit = "transfer" }, + { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, + { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, + { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, .unit = "matrix" }, @@ -4088,7 +4521,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,144 @@ +/* + * This file is part of 
FFmpeg. + * @@ -4211,6 +4644,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + int width, int height, + const struct TonemapIntParams *params); + ++void tonemap_frame_dovi_2_420hdr(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, @@ -4243,7 +4683,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,2289 @@ +@@ -0,0 +1,2584 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -5331,7 +5771,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V3 void tonemap_frame_dovi_2_420hdr_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -5339,33 +5779,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd + int 
remainw = width & 14; + + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ const float in_rng = (float)((1 << in_depth) - 1); + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; + int cby = (*params->rgb2yuv_coeffs)[0][2][0]; @@ -5375,32 +5807,29 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); -+ + __m256i ux8, vx8; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; + __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m256i r0ox16, g0ox16, b0ox16; + __m256i y0ox16; + __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; + __m256i yoax8, yobx8; + -+ __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; + __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ ++ __m128 ipt0, ipt1, ipt2, ipt3, ipt4, ipt5, ipt6, ipt7; ++ __m256 ix8, px8, tx8; ++ __m256 lx8, mx8, sx8; ++ __m256 rx8a, gx8a, bx8a, rx8b, gx8b, bx8b; ++ __m256 y0x8af, y0x8bf, y1x8af, y1x8bf, ux8af, ux8bf, vx8af, 
vx8bf; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + @@ -5414,117 +5843,128 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); -+ + ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); + ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); + vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ y0x8af = _mm256_cvtepi32_ps(y0x8a); ++ y0x8bf = _mm256_cvtepi32_ps(y0x8b); ++ y1x8af = _mm256_cvtepi32_ps(y1x8a); ++ y1x8bf = _mm256_cvtepi32_ps(y1x8b); ++ ux8af = _mm256_cvtepi32_ps(ux8a); ++ ux8bf = _mm256_cvtepi32_ps(ux8b); ++ vx8af = _mm256_cvtepi32_ps(vx8a); ++ vx8bf = _mm256_cvtepi32_ps(vx8b); + -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ 
r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ y0x8af = _mm256_div_ps(y0x8af, _mm256_set1_ps(in_rng)); ++ y0x8bf = _mm256_div_ps(y0x8bf, _mm256_set1_ps(in_rng)); ++ y1x8af = _mm256_div_ps(y1x8af, _mm256_set1_ps(in_rng)); ++ y1x8bf = _mm256_div_ps(y1x8bf, _mm256_set1_ps(in_rng)); ++ ux8af = _mm256_div_ps(ux8af, _mm256_set1_ps(in_rng)); ++ ux8bf = _mm256_div_ps(ux8bf, _mm256_set1_ps(in_rng)); ++ vx8af = _mm256_div_ps(vx8af, _mm256_set1_ps(in_rng)); ++ vx8bf = _mm256_div_ps(vx8bf, _mm256_set1_ps(in_rng)); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ // Reshape y0x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8af, ux8af, vx8af, params); + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8a = 
_mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(JPEG_SCALE)); ++ ++ r0x8a = _mm256_cvtps_epi32(rx8a); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ g0x8a = _mm256_cvtps_epi32(gx8a); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ b0x8a = _mm256_cvtps_epi32(bx8a); + b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ // Reshape y1x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8af, ux8af, vx8af, params); + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, 
_mm256_set1_ps(JPEG_SCALE)); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ r1x8a = _mm256_cvtps_epi32(rx8a); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ g1x8a = _mm256_cvtps_epi32(gx8a); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ b1x8a = _mm256_cvtps_epi32(bx8a); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ // Reshape y0x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8bf, ux8bf, vx8bf, params); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], -+ params->lin_lut, params->tonemap_lut, 
params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ r0x8b = _mm256_cvtps_epi32(rx8b); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ g0x8b = _mm256_cvtps_epi32(gx8b); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ b0x8b = _mm256_cvtps_epi32(bx8b); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ // Reshape y1x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8bf, ux8bf, vx8bf, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); ++ ++ r1x8b = _mm256_cvtps_epi32(rx8b); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ g1x8b = _mm256_cvtps_epi32(gx8b); ++ g1x8b = 
av_clip_int16_avx(g1x8b); ++ b1x8b = _mm256_cvtps_epi32(bx8b); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ roax8 = r0x8a; ++ goax8 = g0x8a; ++ boax8 = b0x8a; ++ ++ robx8 = r0x8b; ++ gobx8 = g0x8b; ++ bobx8 = b0x8b; + + yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); + yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); @@ -5540,21 +5980,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); -+ -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax8 = r1x8a; ++ g1oax8 = g1x8a; ++ b1oax8 = b1x8a; + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx8 = r1x8b; ++ g1obx8 = g1x8b; ++ b1obx8 = b1x8b; + + y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); @@ -5570,9 +6006,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, 
_mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -5597,20 +6033,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); + uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); + vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ _mm_storeu_si128((__m128i_u 
*) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -5623,7 +6057,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ tonemap_frame_dovi_2_420hdr(rdsty, rdstu, rdstv, + rsrcy, rsrcu, rsrcv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -5632,22 +6066,22 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 14; + @@ -5683,24 +6117,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; + __m256i ux8, vx8; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, 
y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; + __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; + __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + @@ -5840,9 +6275,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); + g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); @@ -5870,9 +6305,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x 
+ dstlinesize[0] / 2], y1ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -5897,18 +6332,20 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); + uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); + vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -5921,29 +6358,31 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, 
rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 14; + @@ -5979,54 +6418,48 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; -+ + __m256i r0ox16, g0ox16, b0ox16; + __m256i y0ox16; + __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; + __m256i yoax8, yobx8; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i 
y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); -+ -+ // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, TEN_BIT_BIPLANAR_SHIFT); -+ y1x16 = _mm256_srli_epi16(y1x16, TEN_BIT_BIPLANAR_SHIFT); -+ uvx16 = _mm256_srli_epi16(uvx16, TEN_BIT_BIPLANAR_SHIFT); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + + y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); + y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ + y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); + y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); + y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); + y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, 
in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); + r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); @@ -6142,9 +6575,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); + g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); @@ -6172,9 +6605,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + 
dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -6199,17 +6632,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -6217,19 +6651,21 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + if (remainw) { + int offset = width & (int)0xfffffff0; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ 
rsrcuv += offset; -+ tonemap_frame_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -6237,12 +6673,12 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcuv = srcuv; + int rheight = height; -+ // not zero when not divisible by 8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 14; + @@ -6255,7 +6691,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -6279,23 +6714,24 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; + __m256i uvx16, uvx8a, uvx8b; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + 
__m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; + __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; + __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; @@ -6433,7 +6869,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); + yoax8 = _mm256_srai_epi32(yoax8, out_sh); + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); -+ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); + + yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); + yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); @@ -6441,11 +6876,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); -+ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); + g1ox16 = _mm256_lddqu_si256((const 
__m256i_u *)g1); @@ -6465,7 +6899,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); + y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); -+ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); + + y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); @@ -6473,11 +6906,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); -+ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); + ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); @@ -6511,10 +6943,8 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + + uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); + uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); -+ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); -+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + } + } + @@ -6525,7 +6955,7 @@ Index: 
FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ tonemap_frame_p010_2_nv12(rdsty, rdstuv, + rsrcy, rsrcuv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -6533,484 +6963,1197 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -@@ -0,0 +1,68 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -+#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -+ -+#include "libavfilter/vf_tonemapx.h" -+ -+X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); + +X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, 
+ const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, -+ const struct TonemapIntParams *params); ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; + -+#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,2370 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+#include "vf_tonemapx_intrin_sse.h" ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+# include -+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction -+// cast the register into float register and store with movss as a workaround -+#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) -+__attribute__((always_inline)) -+X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { -+ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); -+ return; -+} -+#endif ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) -+{ -+ __m128i mask = _mm_set1_epi32(0x7FFF); -+ __m128i condition = _mm_and_si128(a, 
_mm_set1_epi32(~0x7FFF)); ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m128i zero = _mm_setzero_si128(); -+ __m128i cmp = _mm_cmpeq_epi32(condition, zero); ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); -+ __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ return result; -+} ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); + -+X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) -+{ -+ __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); -+ __m128i mask = _mm_set1_epi32(~0xFFFF); -+ __m128i condition = _mm_and_si128(add_result, mask); -+ __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, TEN_BIT_BIPLANAR_SHIFT); ++ y1x16 = 
_mm256_srli_epi16(y1x16, TEN_BIT_BIPLANAR_SHIFT); ++ uvx16 = _mm256_srli_epi16(uvx16, TEN_BIT_BIPLANAR_SHIFT); + -+ __m128i shifted = _mm_srai_epi32(a, 31); -+ __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); + -+ return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); -+} ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + -+X86_64_V2 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a) -+{ -+ __m128 n = _mm_sub_ps(y, x); -+ n = _mm_mul_ps(n, a); -+ n = _mm_add_ps(n, x); -+ return n; -+} ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+X86_64_V2 inline static float reduce_floatx4(__m128 x) { -+ x = _mm_hadd_ps(x, x); -+ x = _mm_hadd_ps(x, x); -+ return _mm_cvtss_f32(x); -+} ++ r1x8a = g1x8a = b1x8a = 
_mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + -+X86_64_V2 static inline float reshape_poly(float s, __m128 coeffs) -+{ -+ __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); -+ ps = _mm_mul_ps(ps, coeffs); -+ return reduce_floatx4(ps); -+} ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+X86_64_V2 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr, -+ int mmr_single, int min_order, int max_order) -+{ -+ float s = _mm_cvtss_f32(coeffs); -+ int mmr_idx = 0; -+ int order = 0; ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + -+ __m128 mmr_coeffs, ps; -+ __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} -+ __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} -+ __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} -+ __m128 sigX = sigX01; // sig[0]*sig[1] now positioned at 0 ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_add_epi32(b0x8a, 
_mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1 -+ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2 -+ sigX = _mm_insert_ps(sigX, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3 ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ mmr_idx = mmr_single ? 0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); -+ order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ // dot first order -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 0*4]); -+ ps = _mm_mul_ps(sig, mmr_coeffs); -+ s += reduce_floatx4(ps); -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 1*4]); -+ ps = _mm_mul_ps(sigX, mmr_coeffs); -+ s += reduce_floatx4(ps); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { -+ __m128 sig2 = _mm_mul_ps(sig, sig); -+ __m128 sigX2 = _mm_mul_ps(sigX, sigX); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = 
_mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 2*4]); -+ ps = _mm_mul_ps(sig2, mmr_coeffs); -+ s += reduce_floatx4(ps); -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 3*4]); -+ ps = _mm_mul_ps(sigX2, mmr_coeffs); -+ s += reduce_floatx4(ps); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ if (max_order == 3 && (min_order == 3 || order >= 3)) { -+ __m128 sig3 = _mm_mul_ps(sig2, sig); -+ __m128 sigX3 = _mm_mul_ps(sigX2, sigX); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 4*4]); -+ ps = _mm_mul_ps(sig3, mmr_coeffs); -+ s += reduce_floatx4(ps); -+ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 5*4]); -+ ps = _mm_mul_ps(sigX3, mmr_coeffs); -+ s += reduce_floatx4(ps); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, 
params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = 
_mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, 
_mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); ++ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } + -+ return s; ++ // Process remaining pixels cannot fill the full simd register with scalar version 
++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +@@ -0,0 +1,75 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) -+X86_64_V2 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) -+{ -+ int has_mmr_poly; -+ float s; ++#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H ++#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H ++ ++#include "libavfilter/vf_tonemapx.h" ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_dovi_2_420hdr_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ 
const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +@@ -0,0 +1,2740 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "vf_tonemapx_intrin_sse.h"
++
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++# include <smmintrin.h>
++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
++
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction
++// cast the register into float register and store with movss as a workaround
++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
++__attribute__((always_inline))
++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) {
++    _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a));
++    return;
++}
++#endif
++
++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a)
++{
++    __m128i mask = _mm_set1_epi32(0x7FFF);
++    __m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF));
++
++    __m128i zero = _mm_setzero_si128();
++    __m128i cmp = _mm_cmpeq_epi32(condition, zero);
++
++    __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask);
++    __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a));
++
++    return result;
++}
++
++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a)
++{
++    __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U));
++    __m128i mask = _mm_set1_epi32(~0xFFFF);
++    __m128i condition = _mm_and_si128(add_result, mask);
++    __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128());
++
++    __m128i shifted = _mm_srai_epi32(a, 31);
++    __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF));
++
++    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result));
++}
++
++X86_64_V2 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a)
++{
++    __m128 n = _mm_sub_ps(y, x);
++    n = _mm_mul_ps(n, a);
++    n = _mm_add_ps(n, x);
++    return 
n; ++} ++ ++X86_64_V2 inline static float reduce_floatx4(__m128 x) { ++ x = _mm_hadd_ps(x, x); ++ x = _mm_hadd_ps(x, x); ++ return _mm_cvtss_f32(x); ++} ++ ++X86_64_V2 static inline float reshape_poly(float s, __m128 coeffs) ++{ ++ __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); ++ ps = _mm_mul_ps(ps, coeffs); ++ return reduce_floatx4(ps); ++} ++ ++X86_64_V2 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr, ++ int mmr_single, int min_order, int max_order) ++{ ++ float s = _mm_cvtss_f32(coeffs); ++ int mmr_idx = 0; ++ int order = 0; ++ ++ __m128 mmr_coeffs, ps; ++ __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} ++ __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} ++ __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} ++ __m128 sigX = sigX01; // sig[0]*sig[1] now positioned at 0 ++ ++ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1 ++ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2 ++ sigX = _mm_insert_ps(sigX, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3 ++ ++ mmr_idx = mmr_single ? 
0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); ++ order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); ++ ++ // dot first order ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 0*4]); ++ ps = _mm_mul_ps(sig, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 1*4]); ++ ps = _mm_mul_ps(sigX, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ ++ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { ++ __m128 sig2 = _mm_mul_ps(sig, sig); ++ __m128 sigX2 = _mm_mul_ps(sigX, sigX); ++ ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 2*4]); ++ ps = _mm_mul_ps(sig2, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 3*4]); ++ ps = _mm_mul_ps(sigX2, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ ++ if (max_order == 3 && (min_order == 3 || order >= 3)) { ++ __m128 sig3 = _mm_mul_ps(sig2, sig); ++ __m128 sigX3 = _mm_mul_ps(sigX2, sigX); ++ ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 4*4]); ++ ps = _mm_mul_ps(sig3, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 5*4]); ++ ps = _mm_mul_ps(sigX3, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ } ++ } ++ ++ return s; ++} ++ ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++X86_64_V2 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) ++{ ++ int has_mmr_poly; ++ float s; + + float *src_dovi_params = ctx->dovi_pbuf; + float *src_dovi_pivots = ctx->dovi_pbuf + 24; + float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4* + float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4* + -+ float* dovi_params_i = src_dovi_params + 0*8; -+ float* dovi_pivots_i = src_dovi_pivots + 0*8; -+ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* -+ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* -+ int dovi_num_pivots_i = dovi_params_i[0]; -+ int dovi_has_mmr_i = dovi_params_i[1]; -+ int dovi_has_poly_i = dovi_params_i[2]; -+ int dovi_mmr_single_i = 
dovi_params_i[3]; -+ int dovi_min_order_i = dovi_params_i[4]; -+ int dovi_max_order_i = dovi_params_i[5]; -+ float dovi_lo_i = dovi_params_i[6]; -+ float dovi_hi_i = dovi_params_i[7]; ++ float* dovi_params_i = src_dovi_params + 0*8; ++ float* dovi_pivots_i = src_dovi_pivots + 0*8; ++ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* ++ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* ++ int dovi_num_pivots_i = dovi_params_i[0]; ++ int dovi_has_mmr_i = dovi_params_i[1]; ++ int dovi_has_poly_i = dovi_params_i[2]; ++ int dovi_mmr_single_i = dovi_params_i[3]; ++ int dovi_min_order_i = dovi_params_i[4]; ++ int dovi_max_order_i = dovi_params_i[5]; ++ float dovi_lo_i = dovi_params_i[6]; ++ float dovi_hi_i = dovi_params_i[7]; ++ ++ float* dovi_params_p = src_dovi_params + 1*8; ++ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* ++ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* ++ int dovi_has_mmr_p = dovi_params_p[1]; ++ int dovi_has_poly_p = dovi_params_p[2]; ++ int dovi_mmr_single_p = dovi_params_p[3]; ++ int dovi_min_order_p = dovi_params_p[4]; ++ int dovi_max_order_p = dovi_params_p[5]; ++ float dovi_lo_p = dovi_params_p[6]; ++ float dovi_hi_p = dovi_params_p[7]; ++ ++ float* dovi_params_t = src_dovi_params + 2*8; ++ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* ++ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* ++ int dovi_has_mmr_t = dovi_params_t[1]; ++ int dovi_has_poly_t = dovi_params_t[2]; ++ int dovi_mmr_single_t = dovi_params_t[3]; ++ int dovi_min_order_t = dovi_params_t[4]; ++ int dovi_max_order_t = dovi_params_t[5]; ++ float dovi_lo_t = dovi_params_t[6]; ++ float dovi_hi_t = dovi_params_t[7]; ++ ++ __m128 coeffs, result; ++ ++ // reshape I ++ s = _mm_cvtss_f32(sig); ++ result = sig; ++ if (dovi_num_pivots_i > 2) { ++ __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); ++ __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 
2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2])); ++ __m128 m0123 = mix_float32x4(m01, m23, _mm_set1_ps(s >= dovi_pivots_i[1])); ++ __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4])); ++ __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6])); ++ __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5])); ++ coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); ++ } else { ++ coeffs = _mm_loadu_ps(dovi_coeffs_i); ++ } ++ ++ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_i, ++ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); ++ ++ // reshape P ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_p); ++ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_p, ++ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); ++ ++ // reshape T ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_t); ++ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || 
(!has_mmr_poly && dovi_has_poly_t)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_t, ++ dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); ++ ++ return result; ++} ++ ++X86_64_V2 inline static void ycc2rgbx4(__m128* dy, __m128* dcb, __m128* dcr, ++ __m128 y, __m128 cb, __m128 cr, ++ const double nonlinear[3][3], const float ycc_offset[3]) ++{ ++ *dy = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[0][0])); ++ *dy = _mm_add_ps(*dy, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[0][1]))); ++ *dy = _mm_add_ps(*dy, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[0][2]))); ++ *dy = _mm_sub_ps(*dy, _mm_set1_ps(ycc_offset[0])); ++ ++ *dcb = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[1][0])); ++ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[1][1]))); ++ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[1][2]))); ++ *dcb = _mm_sub_ps(*dcb, _mm_set1_ps(ycc_offset[1])); ++ ++ *dcr = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[2][0])); ++ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[2][1]))); ++ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[2][2]))); ++ *dcr = _mm_sub_ps(*dcr, _mm_set1_ps(ycc_offset[2])); ++} ++ ++X86_64_V2 inline static void lms2rgbx4(__m128* dl, __m128* dm, __m128* ds, ++ __m128 l, __m128 m, __m128 s, ++ const double lms2rgb_matrix[3][3]) ++{ ++ *dl = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[0][0])); ++ *dl = _mm_add_ps(*dl, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[0][1]))); ++ *dl = _mm_add_ps(*dl, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[0][2]))); ++ ++ *dm = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[1][0])); ++ *dm = _mm_add_ps(*dm, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[1][1]))); ++ *dm = _mm_add_ps(*dm, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[1][2]))); ++ ++ *ds = _mm_mul_ps(l, 
_mm_set1_ps((float)lms2rgb_matrix[2][0])); ++ *ds = _mm_add_ps(*ds, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[2][1]))); ++ *ds = _mm_add_ps(*ds, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[2][2]))); ++} ++ ++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m128i sig4; ++ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; ++ __m128 offset = _mm_set1_ps(0.5f); ++ __m128 intermediate_upper_bound = _mm_set1_ps(JPEG_SCALE); ++ __m128i r, g, b, rx4, gx4, bx4; ++ ++ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ ++ r = av_clip_uint16_sse(r_in); ++ g = av_clip_uint16_sse(g_in); ++ b = av_clip_uint16_sse(b_in); ++ ++ sig4 = _mm_max_epi32(r, _mm_max_epi32(g, b)); ++ ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ ++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ ++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ ++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++ ++#undef LOAD_LUT ++ ++ mapvalx4 = _mm_loadu_ps(mapval4); ++ r_linx4 = _mm_loadu_ps(r_lin4); ++ g_linx4 = _mm_loadu_ps(g_lin4); ++ b_linx4 = _mm_loadu_ps(b_lin4); ++ ++ if (!rgb2rgb_passthrough) { ++ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ ++ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); ++ g_linx4 = 
_mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++ } ++ ++ if (desat > 0) { ++ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); ++ __m128 desat4 = _mm_set1_ps((float)desat); ++ __m128 luma4 = _mm_set1_ps(0); ++ __m128 overbright4; ++ ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); ++ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); ++ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); ++ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); ++ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); ++ } ++ ++ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); ++ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); ++ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ ++ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); ++ r_linx4 = _mm_add_ps(r_linx4, offset); ++ ++ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); ++ g_linx4 = _mm_add_ps(g_linx4, offset); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); ++ b_linx4 = _mm_add_ps(b_linx4, offset); ++ ++ rx4 = _mm_cvttps_epi32(r_linx4); ++ rx4 = av_clip_uint16_sse(rx4); ++ gx4 = _mm_cvttps_epi32(g_linx4); ++ gx4 = av_clip_uint16_sse(gx4); ++ bx4 = _mm_cvttps_epi32(b_linx4); ++ bx4 = av_clip_uint16_sse(bx4); ++ ++#define SAVE_COLOR(i) r_out[i] 
= delin_lut[_mm_extract_epi32(rx4, i)]; \ ++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ ++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) ++ ++#undef SAVE_COLOR ++} ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++ ++X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const float in_rng = (float)((1 << in_depth) - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ __m128i r0ox8, g0ox8, b0ox8; 
++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; ++ ++ __m128 ipt0, ipt1, ipt2, ipt3; ++ __m128 ia1, ib1, ia2, ib2; ++ __m128 ix4, px4, tx4; ++ __m128 lx4, mx4, sx4; ++ __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); ++ ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); ++ ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); ++ ++ y0x4af = _mm_cvtepi32_ps(y0x4a); ++ y0x4bf = _mm_cvtepi32_ps(y0x4b); ++ y1x4af = _mm_cvtepi32_ps(y1x4a); ++ y1x4bf = _mm_cvtepi32_ps(y1x4b); ++ ux4af = _mm_cvtepi32_ps(ux4a); ++ ux4bf = _mm_cvtepi32_ps(ux4b); ++ vx4af = _mm_cvtepi32_ps(vx4a); ++ vx4bf = _mm_cvtepi32_ps(vx4b); ++ ++ y0x4af = _mm_div_ps(y0x4af, _mm_set1_ps(in_rng)); ++ y0x4bf = _mm_div_ps(y0x4bf, _mm_set1_ps(in_rng)); ++ y1x4af = _mm_div_ps(y1x4af, _mm_set1_ps(in_rng)); ++ y1x4bf = _mm_div_ps(y1x4bf, _mm_set1_ps(in_rng)); ++ ux4af = _mm_div_ps(ux4af, _mm_set1_ps(in_rng)); ++ ux4bf = _mm_div_ps(ux4bf, 
_mm_set1_ps(in_rng)); ++ vx4af = _mm_div_ps(vx4af, _mm_set1_ps(in_rng)); ++ vx4bf = _mm_div_ps(vx4bf, _mm_set1_ps(in_rng)); ++ ++ // Reshape y0x4a ++ ia1 = _mm_unpacklo_ps(y0x4af, ux4af); ++ ia2 = _mm_unpackhi_ps(y0x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); ++ ++ r0x4a = _mm_cvtps_epi32(rx4a); ++ r0x4a = av_clip_int16_sse(r0x4a); ++ g0x4a = _mm_cvtps_epi32(gx4a); ++ g0x4a = av_clip_int16_sse(g0x4a); ++ b0x4a = _mm_cvtps_epi32(bx4a); ++ b0x4a = av_clip_int16_sse(b0x4a); ++ ++ // Reshape y1x4a ++ ia1 = _mm_unpacklo_ps(y1x4af, ux4af); ++ ia2 = 
_mm_unpackhi_ps(y1x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ float* dovi_params_p = src_dovi_params + 1*8; -+ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* -+ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* -+ int dovi_has_mmr_p = dovi_params_p[1]; -+ int dovi_has_poly_p = dovi_params_p[2]; -+ int dovi_mmr_single_p = dovi_params_p[3]; -+ int dovi_min_order_p = dovi_params_p[4]; -+ int dovi_max_order_p = dovi_params_p[5]; -+ float dovi_lo_p = dovi_params_p[6]; -+ float dovi_hi_p = dovi_params_p[7]; ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); + -+ float* dovi_params_t = src_dovi_params + 2*8; -+ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* -+ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* -+ int dovi_has_mmr_t = dovi_params_t[1]; -+ int dovi_has_poly_t = dovi_params_t[2]; -+ int dovi_mmr_single_t = dovi_params_t[3]; -+ int dovi_min_order_t = dovi_params_t[4]; -+ int dovi_max_order_t = dovi_params_t[5]; -+ float dovi_lo_t = dovi_params_t[6]; -+ float dovi_hi_t = dovi_params_t[7]; ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); + -+ __m128 coeffs, result; ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); + -+ // reshape I -+ s = _mm_cvtss_f32(sig); -+ result = sig; -+ if 
(dovi_num_pivots_i > 2) { -+ __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); -+ __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2])); -+ __m128 m0123 = mix_float32x4(m01, m23, _mm_set1_ps(s >= dovi_pivots_i[1])); -+ __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4])); -+ __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6])); -+ __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5])); -+ coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); -+ } else { -+ coeffs = _mm_loadu_ps(dovi_coeffs_i); -+ } ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) -+ s = reshape_poly(s, coeffs); -+ else -+ s = reshape_mmr(result, coeffs, dovi_mmr_i, -+ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); + -+ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); ++ r1x4a = _mm_cvtps_epi32(rx4a); ++ r1x4a = av_clip_int16_sse(r1x4a); ++ g1x4a = _mm_cvtps_epi32(gx4a); ++ g1x4a = av_clip_int16_sse(g1x4a); ++ b1x4a = _mm_cvtps_epi32(bx4a); 
++ b1x4a = av_clip_int16_sse(b1x4a); + -+ // reshape P -+ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); -+ coeffs = _mm_loadu_ps(dovi_coeffs_p); -+ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; ++ // Reshape y0x4b ++ ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y0x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) -+ s = reshape_poly(s, coeffs); -+ else -+ s = reshape_mmr(result, coeffs, dovi_mmr_p, -+ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); + -+ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); + -+ // reshape T -+ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); -+ coeffs = _mm_loadu_ps(dovi_coeffs_t); -+ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); + -+ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t)) -+ s = 
reshape_poly(s, coeffs); -+ else -+ s = reshape_mmr(result, coeffs, dovi_mmr_t, -+ dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ return result; -+} ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + -+X86_64_V2 inline static void ycc2rgbx4(__m128* dy, __m128* dcb, __m128* dcr, -+ __m128 y, __m128 cb, __m128 cr, -+ const double nonlinear[3][3], const float ycc_offset[3]) -+{ -+ *dy = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[0][0])); -+ *dy = _mm_add_ps(*dy, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[0][1]))); -+ *dy = _mm_add_ps(*dy, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[0][2]))); -+ *dy = _mm_sub_ps(*dy, _mm_set1_ps(ycc_offset[0])); ++ r0x4b = _mm_cvtps_epi32(rx4b); ++ r0x4b = av_clip_int16_sse(r0x4b); ++ g0x4b = _mm_cvtps_epi32(gx4b); ++ g0x4b = av_clip_int16_sse(g0x4b); ++ b0x4b = _mm_cvtps_epi32(bx4b); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ *dcb = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[1][0])); -+ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[1][1]))); -+ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[1][2]))); -+ *dcb = _mm_sub_ps(*dcb, _mm_set1_ps(ycc_offset[1])); ++ // Reshape y1x4b ++ ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y1x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ 
ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); + -+ *dcr = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[2][0])); -+ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[2][1]))); -+ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[2][2]))); -+ *dcr = _mm_sub_ps(*dcr, _mm_set1_ps(ycc_offset[2])); -+} ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); + -+X86_64_V2 inline static void lms2rgbx4(__m128* dl, __m128* dm, __m128* ds, -+ __m128 l, __m128 m, __m128 s, -+ const double lms2rgb_matrix[3][3]) -+{ -+ *dl = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[0][0])); -+ *dl = _mm_add_ps(*dl, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[0][1]))); -+ *dl = _mm_add_ps(*dl, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[0][2]))); ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); + -+ *dm = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[1][0])); -+ *dm = _mm_add_ps(*dm, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[1][1]))); -+ *dm = _mm_add_ps(*dm, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[1][2]))); ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); + -+ *ds = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[2][0])); -+ *ds = _mm_add_ps(*ds, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[2][1]))); -+ *ds = _mm_add_ps(*ds, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[2][2]))); -+} ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = 
_mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); + -+X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ __m128i sig4; -+ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; -+ __m128 offset = _mm_set1_ps(0.5f); -+ __m128 intermediate_upper_bound = _mm_set1_ps(JPEG_SCALE); -+ __m128i r, g, b, rx4, gx4, bx4; ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + -+ r = av_clip_uint16_sse(r_in); -+ g = av_clip_uint16_sse(g_in); -+ b = av_clip_uint16_sse(b_in); ++ r1x4b = _mm_cvtps_epi32(rx4b); ++ r1x4b = av_clip_int16_sse(r1x4b); ++ g1x4b = _mm_cvtps_epi32(gx4b); ++ g1x4b = av_clip_int16_sse(g1x4b); ++ b1x4b = _mm_cvtps_epi32(bx4b); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ sig4 = _mm_max_epi32(r, _mm_max_epi32(g, b)); ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ 
params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ -+r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ -+g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ -+b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+#undef LOAD_LUT ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ mapvalx4 = _mm_loadu_ps(mapval4); -+ r_linx4 = _mm_loadu_ps(r_lin4); -+ g_linx4 = _mm_loadu_ps(g_lin4); -+ b_linx4 = _mm_loadu_ps(b_lin4); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ if (!rgb2rgb_passthrough) { -+ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ 
yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + -+ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); -+ } ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ if (desat > 0) { -+ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); -+ __m128 desat4 = _mm_set1_ps((float)desat); -+ __m128 luma4 = _mm_set1_ps(0); -+ __m128 overbright4; ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); -+ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); -+ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); -+ g_linx4 = _mm_sub_ps(g_linx4, 
_mm_mul_ps(g_linx4, overbright4)); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); -+ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); -+ } ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); -+ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); -+ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); -+ r_linx4 = _mm_add_ps(r_linx4, offset); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); -+ g_linx4 = _mm_add_ps(g_linx4, offset); ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + -+ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); -+ b_linx4 = _mm_add_ps(b_linx4, offset); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ rx4 = _mm_cvttps_epi32(r_linx4); -+ rx4 = 
av_clip_uint16_sse(rx4); -+ gx4 = _mm_cvttps_epi32(g_linx4); -+ gx4 = av_clip_uint16_sse(gx4); -+ bx4 = _mm_cvttps_epi32(b_linx4); -+ bx4 = av_clip_uint16_sse(bx4); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ -+g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ -+b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ SAVE_COLOR(0) -+ SAVE_COLOR(1) -+ SAVE_COLOR(2) -+ SAVE_COLOR(3) ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); + -+#undef SAVE_COLOR -+} ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += 
offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++} + -+X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; -+ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; -+ + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -7061,8 +8204,8 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; + __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + @@ -7306,19 +8449,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c 
+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ // output shift bits for 8bit outputs is 29 - 8 = 21 -+ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + + yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packs_epi32(yoax4, yobx4); -+ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + + r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); + g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); @@ -7336,18 +8478,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + + y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); + y1obx4 = _mm_add_epi32(y1obx4, 
_mm_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); -+ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + + ravgx4 = _mm_hadd_epi32(roax4, robx4); + ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); @@ -7367,16 +8509,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); + uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); + uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_srai_epi32(uox4, out_sh); + uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); ++ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); + + vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); + vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); + vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_srai_epi32(vox4, out_sh); + vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); ++ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); + } + } + @@ -7389,16 +8531,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ 
dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++X86_64_V2 void tonemap_frame_dovi_2_420hdr_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, @@ -7434,9 +8576,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ + __m128i zero128 = _mm_setzero_si128(); + __m128i ux4, vx4; + __m128i y0x8, y1x8; @@ -7444,12 +8583,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; + __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m128i r0ox8, g0ox8, b0ox8; + __m128i y0ox8; + __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; + __m128i yoax4, yobx4; + -+ __m128i r1ox8, g1ox8, b1ox8; + __m128i y1ox8; + __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + __m128i y1oax4, y1obx4; @@ -7674,34 +8811,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + b1x4b = _mm_cvtps_epi32(bx4b); + b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, 
params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); -+ -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ roax4 = r0x4a; ++ goax4 = g0x4a; ++ boax4 = b0x4a; + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ robx4 = r0x4b; ++ gobx4 = g0x4b; ++ bobx4 = b0x4b; + + yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); @@ -7720,17 +8836,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y0ox8 = _mm_packus_epi32(yoax4, yobx4); + _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); -+ -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ r1oax4 = r1x4a; ++ g1oax4 = g1x4a; ++ b1oax4 = b1x4a; + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ r1obx4 = r1x4b; ++ g1obx4 = g1x4b; ++ b1obx4 = b1x4b; + + y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); @@ -7789,7 +8901,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_dovi_2_420p10(rdsty, rdstu, 
rdstv, ++ tonemap_frame_dovi_2_420hdr(rdsty, rdstu, rdstv, + rsrcy, rsrcu, rsrcv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -8985,7 +10097,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -9025,6 +10137,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + int width, int height, + const struct TonemapIntParams *params); + ++X86_64_V2 void tonemap_frame_dovi_2_420hdr_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, From 9ccb771f7656a3db927c23ae06d3e6179799bd7e Mon Sep 17 00:00:00 2001 From: gnattu Date: Fri, 20 Dec 2024 12:53:42 +0800 Subject: [PATCH 2/3] avfilter/tonemapx: fix register type casting for neon --- .../0060-add-simd-optimized-tonemapx-filter.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 10ad3f9ee6..1637b77bf2 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -1909,9 +1909,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + g1x8 = vminq_u16(g1x8, vdupq_n_u16(INT16_MAX)); + b1x8 = vminq_u16(b1x8, vdupq_n_u16(INT16_MAX)); + -+ r0ox8 = r0x8; -+ g0ox8 = g0x8; -+ b0ox8 = b0x8; ++ r0ox8 = vreinterpretq_s16_u16(r0x8); 
++ g0ox8 = vreinterpretq_s16_u16(g0x8); ++ b0ox8 = vreinterpretq_s16_u16(b0x8); + + r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); + g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); @@ -1938,9 +1938,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); + vst1q_u16(&dsty[x], y0ox8); + -+ r1ox8 = r1x8; -+ g1ox8 = g1x8; -+ b1ox8 = b1x8; ++ r1ox8 = vreinterpretq_s16_u16(r1x8); ++ g1ox8 = vreinterpretq_s16_u16(g1x8); ++ b1ox8 = vreinterpretq_s16_u16(b1x8); + + r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); + g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); From 71adc796a8bc3fc4f76a5f4f4cfe3f7e5f89158b Mon Sep 17 00:00:00 2001 From: gnattu Date: Fri, 20 Dec 2024 22:42:51 +0800 Subject: [PATCH 3/3] avfilter/tonemapx: require 10bit output for hdr --- .../patches/0060-add-simd-optimized-tonemapx-filter.patch | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 1637b77bf2..7fb24da65c 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -2635,7 +2635,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1881 @@ +@@ -0,0 +1,1886 @@ +/* + * This file is part of FFmpeg. + * @@ -4081,6 +4081,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return AVERROR_BUG; + } + ++ if (s->trc == AVCOL_TRC_SMPTE2084 && odesc->comp[0].depth == 8) { ++ av_log(s, AV_LOG_ERROR, "HDR passthrough requires 10 bit output format depth\n"); ++ av_assert0(0); ++ } ++ + switch (odesc->comp[2].plane) { + case 1: // biplanar + if (odesc->comp[0].depth == 8) {