commit ee89d8dd67f42b33eb03c100c498e2429bfb7a91 Author: Loren Merritt Date: Sun Jun 27 21:26:01 2010 +0000 ctz-based pixel_ads 20% slower breaks x86_32 diff --git a/common/macroblock.c b/common/macroblock.c index 4561d8a..90e8348 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -341,7 +341,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int); int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range); int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) * - ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); + ((me_range*2*17/16+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa ); } int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+3)&~3) * sizeof(int); diff --git a/common/pixel.c b/common/pixel.c index 8441c7a..896cbef 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -694,7 +694,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT7( satd_x3, _mmxext ); INIT7( satd_x4, _mmxext ); INIT4( hadamard_ac, _mmxext ); - INIT_ADS( _mmxext ); +// INIT_ADS( _mmxext ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext; #if ARCH_X86 diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 1756f86..6aa6c13 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -2146,17 +2146,16 @@ cglobal pixel_ssim_end4_sse2, 3,3,7 %ifdef WIN64 movsxd r5, r5d %endif - mov r0d, r5d - lea r6, [r4+r5+15] - and r6, ~15; + mov r10d, r5d + lea r6, [r4+r5*2] shl r2d, 1 %endmacro %macro ADS_END 1 ; unroll_size add r1, 8*%1 add r3, 8*%1 - add r6, 4*%1 - sub r0d, 4*%1 + add r6, %1/2 + sub r5d, 4*%1 jg .loop %ifdef WIN64 RESTORE_XMM rsp @@ -2197,7 +2196,9 @@ cglobal pixel_ads4_mmxext, 6,7 pshufw mm1, r6m, 0 paddusw mm0, [r3] psubusw mm1, mm0 + 
pxor mm2, mm2 packsswb mm1, mm1 + pcmpeqb mm1, mm2 movd [r6], mm1 ADS_END 1 @@ -2206,6 +2207,7 @@ cglobal pixel_ads2_mmxext, 6,7 pshufw mm5, r6m, 0 pshufw mm7, mm6, 0 pshufw mm6, mm6, 0xAA + pxor mm4, mm4 ADS_START .loop: movq mm0, [r1] @@ -2216,15 +2218,17 @@ cglobal pixel_ads2_mmxext, 6,7 ABS1 mm1, mm3 paddw mm0, mm1 paddusw mm0, [r3] - movq mm4, mm5 - psubusw mm4, mm0 - packsswb mm4, mm4 - movd [r6], mm4 + movq mm2, mm5 + psubusw mm2, mm0 + packsswb mm2, mm2 + pcmpeqb mm2, mm4 + movd [r6], mm2 ADS_END 1 cglobal pixel_ads1_mmxext, 6,7 pshufw mm7, [r0], 0 pshufw mm6, r6m, 0 + pxor mm4, mm4 ADS_START .loop: movq mm0, [r1] @@ -2235,16 +2239,17 @@ cglobal pixel_ads1_mmxext, 6,7 ABS1 mm1, mm3 paddusw mm0, [r3] paddusw mm1, [r3+8] - movq mm4, mm6 - movq mm5, mm6 - psubusw mm4, mm0 - psubusw mm5, mm1 - packsswb mm4, mm5 - movq [r6], mm4 + movq mm2, mm6 + movq mm3, mm6 + psubusw mm2, mm0 + psubusw mm3, mm1 + packsswb mm2, mm3 + pcmpeqb mm2, mm4 + movq [r6], mm2 ADS_END 2 %macro ADS_SSE2 1 -cglobal pixel_ads4_%1, 6,7,12 +cglobal pixel_ads4_%1, 6,7,13 movdqa xmm4, [r0] pshuflw xmm7, xmm4, 0 pshuflw xmm6, xmm4, 0xAA @@ -2260,6 +2265,7 @@ cglobal pixel_ads4_%1, 6,7,12 ADS_START movdqu xmm10, [r1] movdqu xmm11, [r1+r2] + pxor xmm12, xmm12 .loop: movdqa xmm0, xmm10 movdqu xmm1, [r1+16] @@ -2283,7 +2289,9 @@ cglobal pixel_ads4_%1, 6,7,12 movdqa xmm1, xmm8 psubusw xmm1, xmm0 packsswb xmm1, xmm1 - movq [r6], xmm1 + pcmpeqb xmm1, xmm12 + pmovmskb r0d, xmm1 + mov [r6], r0b %else ADS_START .loop: @@ -2308,8 +2316,11 @@ cglobal pixel_ads4_%1, 6,7,12 punpcklqdq xmm1, xmm1 paddusw xmm0, xmm2 psubusw xmm1, xmm0 + pxor xmm3, xmm3 packsswb xmm1, xmm1 - movq [r6], xmm1 + pcmpeqb xmm1, xmm3 + pmovmskb r0d, xmm1 + mov [r6], r0b %endif ; ARCH ADS_END 2 @@ -2335,8 +2346,11 @@ cglobal pixel_ads2_%1, 6,7,8 paddusw xmm0, xmm4 movdqa xmm1, xmm5 psubusw xmm1, xmm0 + pxor xmm3, xmm3 packsswb xmm1, xmm1 - movq [r6], xmm1 + pcmpeqb xmm1, xmm3 + pmovmskb r0d, xmm1 + mov [r6], r0b ADS_END 2 cglobal 
pixel_ads1_%1, 6,7,8 @@ -2362,8 +2376,11 @@ cglobal pixel_ads1_%1, 6,7,8 movdqa xmm5, xmm6 psubusw xmm4, xmm0 psubusw xmm5, xmm1 + pxor xmm3, xmm3 packsswb xmm4, xmm5 - movdqa [r6], xmm4 + pcmpeqb xmm4, xmm3 + pmovmskb r0d, xmm4 + mov [r6], r0w ADS_END 4 %endmacro @@ -2386,56 +2403,40 @@ ADS_SSE2 ssse3 ; return nmv; ; } -%macro TEST 1 - mov [r4+r0*2], r1w - test r2d, 0xff<<(%1*8) - setne r3b - add r0d, r3d - inc r1d -%endmacro - cglobal pixel_ads_mvs, 0,7,0 ads_mvs: - lea r6, [r4+r5+15] - and r6, ~15; + ; clear last block in case width isn't divisible by 8. (assume divisible by 4) + lea r6, [r4+r10*2] + lea r5d, [r10+4] + shr r5d, 3 + mov qword [r6+r5], -1 + jc .skip_clear + or byte [r6+r5-1], 0xf0 +.skip_clear: ; mvs = r4 ; masks = r6 ; width = r5 - ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.) xor r0d, r0d - xor r1d, r1d - mov [r6+r5], r0d - jmp .loopi -ALIGN 16 -.loopi0: - add r1d, 8 - cmp r1d, r5d - jge .end .loopi: - mov r2, [r6+r1] -%ifdef ARCH_X86_64 - test r2, r2 -%else - mov r3, r2 - or r3d, [r6+r1+4] -%endif - jz .loopi0 - xor r3d, r3d - TEST 0 - TEST 1 - TEST 2 - TEST 3 -%ifdef ARCH_X86_64 - shr r2, 32 -%else - mov r2d, [r6+r1] -%endif - TEST 0 - TEST 1 - TEST 2 - TEST 3 - cmp r1d, r5d - jl .loopi + mov r2, [r6+r0] + lea r1d, [r0*8-1] + xor r2, -1 + jz .endbit +.loopbit: + bsf r3, r2 + lea r1d, [r1+r3+1] + mov [r4], r1w + shr r2, r3b + add r4, 2 + shr r2, 1 + jnz .loopbit +.endbit: + add r0d, 8 + sub r5d, 8 + jg .loopi .end: - movifnidn eax, r0d + sub r4d, r10d + shr r4d, 1 + movifnidn eax, r4d RET diff --git a/encoder/me.c b/encoder/me.c index 19c5b2b..5330fbf 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -25,6 +25,7 @@ #include "common/common.h" #include "macroblock.h" #include "me.h" +#include <libavutil/timer.h> /* presets selected from good points on the speed-vs-quality curve of several test videos * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel } @@ 
-609,7 +610,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, if( h->mb.i_me_method == X264_ME_TESA ) { // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD - mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4); + mvsad_t *mvsads = (mvsad_t *)(xs + ALIGN(width*17/16,16) + 4); int nmvsad = 0, limit; int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride ) @@ -621,8 +622,10 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, if( bsad <= ycost ) continue; bsad -= ycost; + {START_TIMER; xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, cost_fpel_mvx+min_x, xs, width, bsad * 17 >> 4 ); + STOP_TIMER("tesa");} for( i = 0; i < xn-2; i += 3 ) { pixel *ref = p_fref_w+min_x+my*stride; @@ -712,8 +715,10 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, if( bcost <= ycost ) continue; bcost -= ycost; + {START_TIMER; xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, cost_fpel_mvx+min_x, xs, width, bcost ); + STOP_TIMER("esa");} for( i = 0; i < xn-2; i += 3 ) COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my ); bcost += ycost; diff --git a/tools/checkasm.c b/tools/checkasm.c index 7fa2c0c..cdded5b 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -241,7 +241,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) x264_predict8x8_t predict_8x8[9+3]; x264_predict_8x8_filter_t predict_8x8_filter; ALIGNED_16( pixel edge[33] ); - uint16_t cost_mv[32]; + uint16_t cost_mv[100]; int ret = 0, ok, used_asm; x264_pixel_init( 0, &pixel_c ); @@ -462,24 +462,26 @@ static int check_pixel( int cpu_ref, int cpu_new ) } ok = 1; used_asm = 0; - for( int i = 0; i < 32; i++ ) + for( int i = 0; i < 100; i++ ) cost_mv[i] = i*10; for( int i = 0; i < 100 && ok; i++ ) if( pixel_asm.ads[i&3] != 
pixel_ref.ads[i&3] ) { - ALIGNED_16( uint16_t sums[72] ); + int n = 100; + int stride = 32; + ALIGNED_16( uint16_t sums[n+stride+16] ); ALIGNED_16( int dc[4] ); - int16_t mvs_a[32], mvs_c[32]; + int16_t mvs_a[n*17/16+4], mvs_c[n*17/16+4]; int mvn_a, mvn_c; int thresh = rand() & 0x3fff; set_func_name( "esa_ads" ); - for( int j = 0; j < 72; j++ ) + for( int j = 0; j < n+stride+16; j++ ) sums[j] = rand() & 0x3fff; for( int j = 0; j < 4; j++ ) dc[j] = rand() & 0x3fff; used_asm = 1; - mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh ); - mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh ); + mvn_c = call_c( pixel_c.ads[i&3], dc, sums, stride, cost_mv, mvs_c, n, thresh ); + mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, stride, cost_mv, mvs_a, n, thresh ); if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) ) { ok = 0;