movzx maskd, word [src2q + wq - 2]
mov [leftq], maskd
RET
+
+INIT_XMM sse2
+;------------------------------------------------------------------------------
+; void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1,
+;                                         const uint16_t *src2, unsigned mask,
+;                                         int w, int *left, int *left_top)
+;
+; Median-predictor residual: dst[i] = (src2[i] - pred) & mask, where
+; pred = mid_pred(l, t, l + t - lt) with t = src1[i], lt = src1[i-1],
+; l = src2[i-1]; for i == 0 the lt/l values come from *left_top / *left.
+; On return *left_top / *left are updated from the last src1/src2 elements.
+;
+; NOTE(review): the clamp uses signed pmin/pmax, so samples must stay below
+; 0x8000 -- matches the "bpp < 16" guard at the init site.  The unconditional
+; mmsize-byte head loads of src1/src2 assume w >= 8 ("width >= 8" guard).
+; Three-operand forms (psubw/pmaxsw m, m, m) rely on x86inc's SSE emulation
+; (mova + two-operand op) -- TODO confirm against the file's x86inc version.
+cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top
+ movd m5, maskd
+ lea wd, [wd+wd-(mmsize-1)] ; wd = 2*w - (mmsize-1): byte count, biased so the
+                            ; loop's sign test ends on the last full vector
+ movu m0, [src1q] ; first mmsize bytes of the top row (head vector)
+ movu m2, [src2q] ; first mmsize bytes of the current row
+ SPLATW m5, m5 ; broadcast mask to all 8 words
+ add dstq, wq
+ movd m1, [left_topq]
+ neg wq ; wq counts up from -(2*w-(mmsize-1)) towards 0
+ movd m3, [leftq]
+ sub src1q, wq ; bias pointers so [reg + wq] addresses element 0 ...
+ sub src2q, wq ; ... on the first iteration
+ pslldq m0, 2 ; shift one word in to make room for the carried-in value
+ pslldq m2, 2
+ por m0, m1 ; splice *left_top in as the "element -1" top-left
+ por m2, m3 ; splice *left in as the "element -1" left
+ jmp .init ; first iteration uses the spliced vectors instead of loads
+
+.loop:
+ movu m0, [src1q + wq - 2] ; lt
+ movu m2, [src2q + wq - 2] ; l
+.init:
+ movu m1, [src1q + wq] ; t
+ movu m3, [src2q + wq] ; current src2 samples
+ psubw m4, m2, m0 ; l - lt
+ pmaxsw m0, m1, m2 ; max(t, l)
+ paddw m4, m1 ; l - lt + t
+ pminsw m2, m1 ; min(t, l)
+ pand m4, m5 ; (l - lt + t)&mask
+ pminsw m4, m0 ; clamp from above by max(t, l) ...
+ pmaxsw m4, m2 ; ... and from below by min(t, l) -> pred = mid_pred
+ psubw m3, m4 ; src2[i] - pred
+ pand m3, m5 ; wrap the residual into the sample range
+ movu [dstq + wq], m3
+ add wq, 16
+ js .loop
+
+ cmp wd, mmsize-1 ; landed exactly on the end? (wq stays odd, so 15 here
+ jne .tail        ; means the last store covered the final element)
+
+ movzx src1d, word [src1q + (mmsize-1) - 2] ; last top sample -> *left_top
+ movzx src2d, word [src2q + (mmsize-1) - 2] ; last current sample -> *left
+ mov [left_topq], src1d
+ mov [leftq], src2d
+ RET
+.tail:
+ mov wq, -1 ; redo the final vector, overlapping already-written output;
+ jmp .loop  ; element-aligned since dstq is odd-biased, and safe as w >= 8
unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
{
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_int16 = ff_diff_int16_sse2;
+ if (bpp < 16 && width >= 8)
+ c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {