avcodec/x86/huffyuvencdsp: Add SSE2 sub_hfyu_median_pred_int16

author Andreas Rheinhardt <andreas.rheinhardt@outlook.com>

Wed, 25 Feb 2026 23:43:09 +0000 (00:43 +0100)

committer Andreas Rheinhardt <andreas.rheinhardt@outlook.com>

Sun, 1 Mar 2026 11:03:55 +0000 (12:03 +0100)
author Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Wed, 25 Feb 2026 23:43:09 +0000 (00:43 +0100)
committer Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Sun, 1 Mar 2026 11:03:55 +0000 (12:03 +0100)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm

index 8bfd0face008fe8d2ee8deda025ab4790e36bca3..3d38931893380c04b67898944ac8ddafd9d865c1 100644 (file)
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -94,3 +94,53 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_
      movzx maskd, word [src2q + wq - 2]
      mov [leftq], maskd
      RET
+
+INIT_XMM sse2                          ; SSE2 build of the 16-bit median-prediction residual; mmsize is presumably 16 here (matches the `add wq, 16` step) - confirm against x86inc
+cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top ; C side: (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); the init code in this patch only installs it when bpp < 16 and width >= 8
+    movd         m5, maskd             ; mask -> low dword of m5
+    lea          wd, [wd+wd-(mmsize-1)] ; w = 2*w - (mmsize-1): row size in bytes, biased by -(mmsize-1)
+    movu         m0, [src1q]           ; first vector of src1 (t samples of the first block)
+    movu         m2, [src2q]           ; first vector of src2 (current samples of the first block)
+    SPLATW       m5, m5                ; presumably broadcasts the mask to every word lane (SPLATW is defined outside this hunk)
+    add        dstq, wq                ; dst += 2*w - (mmsize-1), so [dstq + wq] below starts at the row start
+    movd         m1, [left_topq]       ; *left_top -> low dword of m1
+    neg          wq                    ; wq = -(2*w - (mmsize-1)): negative byte offset that counts up
+    movd         m3, [leftq]           ; *left -> low dword of m3
+    sub       src1q, wq                ; src1 += 2*w - (mmsize-1) (wq is negative here)
+    sub       src2q, wq                ; src2 += 2*w - (mmsize-1)
+    pslldq       m0, 2                 ; shift src1 up by one word, leaving word lane 0 zero
+    pslldq       m2, 2                 ; same for src2
+    por          m0, m1                ; lane 0 = *left_top -> m0 is "lt" for the first block; NOTE(review): ORs a full dword, so this assumes the upper 16 bits of *left_top are zero - confirm
+    por          m2, m3                ; lane 0 = *left -> m2 is "l" for the first block; same 16-bit assumption for *left
+    jmp       .init                    ; lt/l were built by hand, so skip the in-loop loads of them
+
+.loop:
+    movu         m0, [src1q + wq - 2]   ; lt: src1 one sample (2 bytes) before the current position
+    movu         m2, [src2q + wq - 2]   ; l: src2 one sample before the current position
+.init:
+    movu         m1, [src1q + wq]       ; t: src1 at the current position
+    movu         m3, [src2q + wq]       ; cur: src2 samples at the current position (the values being predicted)
+    psubw        m4, m2, m0             ; l - lt
+    pmaxsw       m0, m1, m2             ; m0 = max(t, l), signed word compare
+    paddw        m4, m1                 ; l - lt + t
+    pminsw       m2, m1                 ; m2 = min(l, t), signed word compare
+    pand         m4, m5                 ; (l - lt + t)&mask
+    pminsw       m4, m0                 ; clamp the gradient from above by max(t, l)
+    pmaxsw       m4, m2                 ; pred = median of l, t and (l - lt + t)&mask
+    psubw        m3, m4                 ; cur - pred (m3 holds the current src2 samples, not l)
+    pand         m3, m5                 ; (cur - pred)&mask
+    movu [dstq + wq], m3                ; store one vector of residuals
+    add          wq, 16                 ; advance by one vector; the sign flag drives the branch below
+    js        .loop                     ; keep going while the offset is still negative
+
+    cmp          wd, mmsize-1           ; offset == mmsize-1 exactly when the last block ended at the row end
+    jne       .tail                     ; otherwise part of the row is still unprocessed
+
+    movzx     src1d, word [src1q + (mmsize-1) - 2] ; last src1 sample of the row (src1q was advanced by 2*w - (mmsize-1))
+    movzx     src2d, word [src2q + (mmsize-1) - 2] ; last src2 sample of the row
+    mov [left_topq], src1d              ; *left_top = last src1 sample, zero-extended
+    mov     [leftq], src2d              ; *left = last src2 sample, zero-extended
+    RET
+.tail:
+    mov          wq, -1                 ; position the final block on the last 16 bytes of the row; after `add wq, 16` the offset is mmsize-1
+    jmp       .loop                     ; run one more block, overlapping output that was already written
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c

index 153edabf02dc2b62e847a11999b2097e52e9d5b1..e32b7ea19db8ef5fbfe1acfaa8035813078efe0f 100644 (file)
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -33,6 +33,8 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src
                          unsigned mask, int w);
  void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                                            unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                                        unsigned mask, int w, int *left, int *left_top);
  
  av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
  {
@@ -44,6 +46,8 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid
  
      if (EXTERNAL_SSE2(cpu_flags)) {
          c->diff_int16 = ff_diff_int16_sse2;
+        if (bpp < 16 && width >= 8)
+            c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_sse2;
      }
  
      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
author	Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
	Wed, 25 Feb 2026 23:43:09 +0000 (00:43 +0100)
committer	Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
	Sun, 1 Mar 2026 11:03:55 +0000 (12:03 +0100)
libavcodec/x86/huffyuvencdsp.asm		patch \| blob \| history
libavcodec/x86/huffyuvencdsp_init.c		patch \| blob \| history