uyvytoyuv422 converts packed UYVY, whose macroblocks are pixel pairs, and
the SIMD code only handled even widths. On an odd width the trailing half
macroblock made the kernel write past the end of the Y/U/V destinations:
the AVX512ICL masked tail dropped the odd pixel and the fall-through
re-entered the SIMD loop, writing a full mmsize*2 chunk past the planes
(127 bytes of Y, 63 of U and 63 of V); the sse2/avx/avx2 scalar tail
wrote one byte past the Y plane.
Process only whole pairs and emit the trailing odd column from a small
per-row epilogue that matches uyvytoyuv422_c (ydst[w-1] = src[2w-1],
udst[cw-1] = src[2w-2], vdst[cw-1] = src[2w], where cw = (w+1)/2 is the
chroma width).
All four SIMD variants are now bit-exact with the C reference for even and
odd widths and no longer overwrite the destination. Verified on AVX512ICL
hardware (Ryzen 9 9950X) with checkasm.
Found-by: Claude (Anthropic). Human-verified and reported by Omkhar Arasaratnam <omkhar@linkedin.com>.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
(cherry picked from commit 21782b7b3143a3ed68de635c83b2094523e4cf39)
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
movsxdifnidn src_strideq, src_strided
mov back_wq, wq
+ and wq, -2 ; process whole UYVY pairs; trailing odd column via epilogue
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2
;calc scalar loop count
and xq, mmsize * 2 - 1
- je .loop_simd
+ je .skip_tail
%if mmsize == 64
shr xq, 1
%endif
; check if simd loop is need
+.skip_tail:
cmp wq, 0
jge .end_line
jl .loop_simd
.end_line:
+ test back_wq, 1
+ jz .skip_last
+ mov tmpb, [srcq + 1]
+ mov [ydstq], tmpb
+ mov tmpb, [srcq + 0]
+ mov [udstq], tmpb
+ mov tmpb, [srcq + 2]
+ mov [vdstq], tmpb
+ .skip_last:
add srcq, src_strideq
add ydstq, lum_strideq
add udstq, chrom_strideq
;restore initial state of line variable
mov wq, back_wq
+ and wq, -2
mov xq, wq
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2