diff --git a/common/frame.c b/common/frame.c index bba0da9..0ea67a8 100644 --- a/common/frame.c +++ b/common/frame.c @@ -460,13 +460,15 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int } } } -static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_chroma_c( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ) { - deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 ); + deblock_chroma_c( pixu, stride, 1, alpha, beta, tc0 ); + deblock_chroma_c( pixv, stride, 1, alpha, beta, tc0 ); } -static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_c( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ) { - deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 ); + deblock_chroma_c( pixu, 1, stride, alpha, beta, tc0 ); + deblock_chroma_c( pixv, 1, stride, alpha, beta, tc0 ); } static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) @@ -540,16 +542,18 @@ static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystrid pix += ystride; } } -static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) +static void deblock_v_chroma_intra_c( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ) { - deblock_chroma_intra_c( pix, stride, 1, alpha, beta ); + deblock_chroma_intra_c( pixu, stride, 1, alpha, beta ); + deblock_chroma_intra_c( pixv, stride, 1, alpha, beta ); } -static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_c( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ) { - deblock_chroma_intra_c( pix, 1, stride, alpha, beta ); + deblock_chroma_intra_c( pixu, 1, stride, alpha, beta ); + deblock_chroma_intra_c( pixv, 1, stride, alpha, beta ); } -static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) +static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, void *func ) { const int index_a = i_qp + h->sh.i_alpha_c0_offset; const int alpha = alpha_table(index_a); @@ -564,12 +568,13 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_ tc[2] = tc0_table(index_a)[bS[2]] + b_chroma; tc[3] = tc0_table(index_a)[bS[3]] + b_chroma; - pf_inter( pix1, i_stride, alpha, beta, tc ); if( b_chroma ) - pf_inter( pix2, i_stride, alpha, beta, tc ); + ((x264_deblock_chroma_inter_t)func)( pix1, i_stride, alpha, beta, tc, pix2 ); + else + ((x264_deblock_luma_inter_t)func)( pix1, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, void *func ) { const int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset); const int beta = beta_table(i_qp + h->sh.i_beta_offset); @@ -577,9 +582,10 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, if( !alpha || !beta ) return; - pf_intra( pix1, i_stride, alpha, beta ); if( b_chroma ) - pf_intra( pix2, i_stride, alpha, beta ); + ((x264_deblock_chroma_intra_t)func)( pix1, i_stride, alpha, beta, pix2 ); + else + ((x264_deblock_luma_intra_t)func)( pix1, i_stride, alpha, beta ); } void x264_frame_deblock_row( x264_t *h, int mb_y ) @@ -755,10 +761,12 @@ void x264_frame_deblock( x264_t *h ) } #ifdef HAVE_MMX -void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_mmxext( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ); +void x264_deblock_v_chroma_sse2 ( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ); +void x264_deblock_h_chroma_mmxext( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ); +void x264_deblock_v_chroma_intra_mmxext( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ); +void x264_deblock_v_chroma_intra_sse2 ( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ); +void x264_deblock_h_chroma_intra_mmxext( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ); void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); @@ -816,8 +824,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) { pf->deblock_v_luma = x264_deblock_v_luma_sse2; pf->deblock_h_luma = x264_deblock_h_luma_sse2; + pf->deblock_v_chroma = x264_deblock_v_chroma_sse2; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_sse2; } } #endif diff --git a/common/frame.h b/common/frame.h index 523689f..5c266b9 100644 --- a/common/frame.h +++ b/common/frame.h @@ -86,18 +86,20 @@ typedef struct } x264_frame_t; -typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta ); +typedef void (*x264_deblock_luma_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +typedef void (*x264_deblock_luma_intra_t)( uint8_t *pix, int stride, int alpha, int beta ); +typedef void (*x264_deblock_chroma_inter_t)( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ); +typedef void (*x264_deblock_chroma_intra_t)( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ); typedef struct { - x264_deblock_inter_t deblock_v_luma; - x264_deblock_inter_t deblock_h_luma; - x264_deblock_inter_t deblock_v_chroma; - x264_deblock_inter_t deblock_h_chroma; - x264_deblock_intra_t deblock_v_luma_intra; - x264_deblock_intra_t deblock_h_luma_intra; - x264_deblock_intra_t deblock_v_chroma_intra; - x264_deblock_intra_t deblock_h_chroma_intra; + x264_deblock_luma_inter_t deblock_v_luma; + x264_deblock_luma_inter_t deblock_h_luma; + x264_deblock_chroma_inter_t deblock_v_chroma; + x264_deblock_chroma_inter_t deblock_h_chroma; + x264_deblock_luma_intra_t deblock_v_luma_intra; + x264_deblock_luma_intra_t deblock_h_luma_intra; + x264_deblock_chroma_intra_t deblock_v_chroma_intra; + x264_deblock_chroma_intra_t deblock_h_chroma_intra; } x264_deblock_function_t; x264_frame_t *x264_frame_new( x264_t *h ); diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 8b5ef26..7f6d625 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -755,14 +755,16 @@ INIT_MMX %macro CHROMA_V_START 0 dec r2d ; alpha-1 dec r3d ; beta-1 - mov t5, r0 - sub t5, r1 - sub t5, r1 +.skip_dec: + lea t5, [r1*2] + sub r0, t5 + add t5, r1 %endmacro %macro CHROMA_H_START 0 dec r2d dec r3d +.skip_dec: sub r0, 2 lea t6, [r1*3] mov t5, r0 @@ -773,29 +775,48 @@ INIT_MMX %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void x264_deblock_v_chroma_mmxext( uint8_t *pixu, int stride, int alpha, int beta, int8_t *tc0, uint8_t *pixv ) +; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pixu, int stride, int alpha, int beta, uint8_t *pixv ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +%macro DECL_CHROMA 4 ; dir, intra, args, regs +%ifdef ARCH_X86_64 +cglobal x264_deblock_%1_chroma_%2mmxext, %3,%4 + mov r10, r%3 + call deblock_%{1}1_chroma_%2mmxext + mov r0, r10 + jmp deblock_%{1}1_chroma_%2mmxext.skip_dec +%else +cglobal x264_deblock_%1_chroma_%2mmxext, %3,%4 + call deblock_%{1}1_chroma_%2mmxext + mov r0, r %+ %3m + call deblock_%{1}1_chroma_%2mmxext.skip_dec + RET +%endif +%endmacro + +DECL_CHROMA v, , 5, 6 +DECL_CHROMA h, , 5, 7 + +ALIGN 16 +deblock_v1_chroma_mmxext: CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] + movq m0, [r0] + movq m1, [r0+r1] + movq m2, [r0+r1*2] + movq m3, [r0+t5] call chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET + movq [r0+r1], m1 + movq [r0+r1*2], m2 + ret -;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +ALIGN 16 +deblock_h1_chroma_mmxext: %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] %else - %define buf0 r0m - %define buf1 r2m + %define buf0 r1m + %define buf1 r3m %endif CHROMA_H_START TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) @@ -805,7 +826,7 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7 movq m0, buf0 movq m3, buf1 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) - RET + ret ALIGN 16 chroma_inter_body_mmxext: @@ -821,7 +842,7 @@ chroma_inter_body_mmxext: ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 %macro CHROMA_INTRA_P0 3 - movq m4, %1 + mova m4, %1 pxor m4, %3 pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1 pavgb %1, %3 @@ -832,29 +853,28 @@ chroma_inter_body_mmxext: %define t5 r4 %define t6 r5 -;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +DECL_CHROMA v, intra_, 4, 5 +DECL_CHROMA h, intra_, 4, 6 + +ALIGN 16 +deblock_v1_chroma_intra_mmxext: CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] + movq m0, [r0] + movq m1, [r0+r1] + movq m2, [r0+r1*2] + movq m3, [r0+t5] call chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET + movq [r0+r1], m1 + movq [r0+r1*2], m2 + ret -;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +ALIGN 16 +deblock_h1_chroma_intra_mmxext: CHROMA_H_START TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body_mmxext TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) - RET + ret ALIGN 16 chroma_intra_body_mmxext: @@ -870,3 +890,76 @@ chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret + + + +INIT_XMM + +%define t5 r6 +;%define t6 r7 ; FIXME + +cglobal x264_deblock_v_chroma_sse2, 6,7 + CHROMA_V_START + sub r5, r1 + sub r5, r1 + movq m0, [r0] + movq m1, [r0+r1] + movq m2, [r0+r1*2] + movq m3, [r0+t5] + movhps m0, [r5] + movhps m1, [r5+r1] + movhps m2, [r5+r1*2] + movhps m3, [r5+t5] + call chroma_inter_body_sse2 + movq [r0+r1], m1 + movq [r0+r1*2], m2 + movhps [r5+r1], m1 + movhps [r5+r1*2], m2 + RET + +ALIGN 16 +chroma_inter_body_sse2: + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + movlhps m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + ret + +%define t5 r5 + +;FIXME dupe +cglobal x264_deblock_v_chroma_intra_sse2, 5,6 + CHROMA_V_START + sub r4, r1 + sub r4, r1 + movq m0, [r0] + movq m1, [r0+r1] + movq m2, [r0+r1*2] + movq m3, [r0+t5] + movhps m0, [r4] + movhps m1, [r4+r1] + movhps m2, [r4+r1*2] + movhps m3, [r4+t5] + call chroma_intra_body_sse2 + movq [r0+r1], m1 + movq [r0+r1*2], m2 + movhps [r4+r1], m1 + movhps [r4+r1*2], m2 + RET + +ALIGN 16 +chroma_intra_body_sse2: + LOAD_MASK r2d, r3d + mova m5, m1 + mova m6, m2 + CHROMA_INTRA_P0 m1, m0, m3 + CHROMA_INTRA_P0 m2, m3, m0 + psubb m1, m5 + psubb m2, m6 + pand m1, m7 + pand m2, m7 + paddb m1, m5 + paddb m2, m6 + ret diff --git a/tools/checkasm.c b/tools/checkasm.c index c21c865..845fe94 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -871,6 +871,7 @@ static int check_deblock( int cpu_ref, int cpu_new ) for( i = 0; i < 36; i++ ) \ { \ int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */\ + uint8_t *pix; \ for( j = 0; j < 1024; j++ ) \ /* two distributions of random to excersize different failure modes */\ buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \ @@ -879,27 +880,31 @@ static int check_deblock( int cpu_ref, int cpu_new ) { \ set_func_name( #name );\ used_asm = 1; \ - call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ - call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ + pix = buf3+off; \ + call_c1( db_c.name, pix, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ + pix = buf4+off; \ + call_a1( db_a.name, pix, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ ok = 0; \ fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \ break; \ } \ - call_c2( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ - call_a2( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ + pix = buf3+off; \ + call_c2( db_c.name, pix, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ + pix = buf4+off; \ + call_a2( db_a.name, pix, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ } \ } TEST_DEBLOCK( deblock_h_luma, 0, tcs[i] ); TEST_DEBLOCK( deblock_v_luma, 1, tcs[i] ); - TEST_DEBLOCK( deblock_h_chroma, 0, tcs[i] ); - TEST_DEBLOCK( deblock_v_chroma, 1, tcs[i] ); + TEST_DEBLOCK( deblock_h_chroma, 0, tcs[i], pix+16 ); + TEST_DEBLOCK( deblock_v_chroma, 1, tcs[i], pix+16 ); TEST_DEBLOCK( deblock_h_luma_intra, 0 ); TEST_DEBLOCK( deblock_v_luma_intra, 1 ); - TEST_DEBLOCK( deblock_h_chroma_intra, 0 ); - TEST_DEBLOCK( deblock_v_chroma_intra, 1 ); + TEST_DEBLOCK( deblock_h_chroma_intra, 0, pix+16 ); + TEST_DEBLOCK( deblock_v_chroma_intra, 1, pix+16 ); report( "deblock :" );