Index: encoder/me.c =================================================================== --- encoder/me.c (revision 27) +++ encoder/me.c (working copy) @@ -25,10 +25,24 @@ #include #include #include +#include #include "common/common.h" #include "me.h" +#define BENCH 0 + +#if BENCH +static uint64_t sad16_calls=0, sad16_new=0; +static uint64_t sad8_calls=0, sad8_new=0; +static uint64_t sad_print=1; +#define SAD_CALL(s) sad##s##_calls++; +#define SAD_NEW(s) sad##s##_new++; +#else +#define SAD_CALL(s) +#define SAD_NEW(s) +#endif + /* presets selected from good points on the speed-vs-quality curve of several test videos * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel } * where me_* are the number of EPZS iterations run on all candidate block types, @@ -43,19 +57,38 @@ {0,0,2,3}}; static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters ); +static void refine_subpel_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int hpel_iters, int qpel_iters ); +//FIXME +#if 0 +#define COPY3_IF_LT(x,y,a,b,c,d)\ +asm volatile (\ + "cmpl %0, %3 \n\t"\ + "cmovl %3, %0 \n\t"\ + "cmovl %4, %1 \n\t"\ + "cmovl %5, %2 \n\t"\ + : "+r" (x), "+r" (a), "+r" (c)\ + : "r" (y), "r" (b), "r" (d)\ +); +#else +#define COPY3_IF_LT(x,y,a,b,c,d) \ +{ \ + if((y)<(x)) \ + { \ + (x)=(y);\ + (a)=(b);\ + (c)=(d);\ + } \ +} +#endif + #define COST_MV( mx, my ) \ { \ int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], \ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \ + p_cost_mvx[ (mx)<<2 ] \ + p_cost_mvy[ (my)<<2 ]; \ - if( cost < bcost ) \ - { \ - bcost = cost; \ - bmx = mx; \ - bmy = my; \ - } \ + COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ } void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh ) @@ -86,7 +119,7 @@ bmx = pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max ); bmy = pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max ); bcost = COST_MAX; - 
COST_MV( bmx, bmy ); + COST_MV( pmx, pmy ); /* I don't know why this helps */ bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ]; @@ -283,6 +316,7 @@ } #undef COST_MV + void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) { int hpel = subpel_iterations[h->mb.i_subpel_refine][0]; @@ -354,4 +388,507 @@ m->mv[1] = bmy; m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ]; } +#undef COST_MV + + + +#define MVD( mvp, mx, my ) \ + p_cost_mv[ ((mx)<<2) - (mvp)[0] ] + p_cost_mv[ ((my)<<2) - (mvp)[1] ] + +static int cost_mv_multipart_16x16( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my ) +{ + const int i_stride = m->i_stride[0]; + const int16_t *p_cost_mv = m->p_cost_mv; + int sad[4]; + int i8; + int mx2 = mx<<2, my2 = my<<2; + uint8_t *cache_entry; + + SAD_CALL(16); + if( CACHE_FPEL_IN_RANGE( mx, my ) ) + { + cache_entry = &l->p_cache_fpel[ CACHE_FPEL_OFFS( mx, my ) ]; + if( *cache_entry ) + return 0xffff; + *cache_entry = 1; + } + SAD_NEW(16); + + h->pixf.sad_split16x16( m->p_fenc[0], i_stride, &m->p_fref[0][mx+my*i_stride], i_stride, sad ); + + for( i8 = 0; i8 < 4; i8++ ) + COPY3_IF_LT( l->me8x8[i8].cost, sad[i8] + MVD(l->me8x8[i8].mvp, mx, my), + l->me8x8[i8].mv[0], mx2, l->me8x8[i8].mv[1], my2 ); + COPY3_IF_LT( l->me16x8[0].cost, sad[0]+sad[1] + MVD(l->me16x8[0].mvp, mx, my), + l->me16x8[0].mv[0], mx2, l->me16x8[0].mv[1], my2 ); + COPY3_IF_LT( l->me16x8[1].cost, sad[2]+sad[3] + MVD(l->me16x8[1].mvp, mx, my), + l->me16x8[1].mv[0], mx2, l->me16x8[1].mv[1], my2 ); + COPY3_IF_LT( l->me8x16[0].cost, sad[0]+sad[2] + MVD(l->me8x16[0].mvp, mx, my), + l->me8x16[0].mv[0], mx2, l->me8x16[0].mv[1], my2 ); + COPY3_IF_LT( l->me8x16[1].cost, sad[1]+sad[3] + MVD(l->me8x16[1].mvp, mx, my), + l->me8x16[1].mv[0], mx2, l->me8x16[1].mv[1], my2 ); + return sad[0]+sad[1]+sad[2]+sad[3] + MVD(m->mvp, mx, my); +} +static int cost_mv_multipart_8x8( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my ) +{ + const int i_stride = m->i_stride[0]; + const int16_t 
*p_cost_mv = m->p_cost_mv; + + SAD_CALL(8); + if( CACHE_FPEL_IN_RANGE( mx, my ) && l->p_cache_fpel[ CACHE_FPEL_OFFS( mx, my ) ] ) + return 0xffff; + SAD_NEW(8); + + return h->pixf.sad[m->i_pixel]( m->p_fenc[0], i_stride, &m->p_fref[0][my*i_stride+mx], i_stride ) + + MVD(m->mvp, mx, my); +} + +static int (*const cost_mv_multipart[4])( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my ) = +{ + cost_mv_multipart_16x16, cost_mv_multipart_8x8, cost_mv_multipart_8x8, cost_mv_multipart_8x8 +}; + +#define COST_MV( mx, my ) \ +{ \ + int cost = cost_mv_multipart[ m->i_pixel ]( h, l, m, mx, my ); \ + COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ +} + +void x264_me_search_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh ) +{ + const int i_me_range = h->param.analyse.i_me_range; + const int16_t *p_cost_mv = m->p_cost_mv; + const int *mvp = m->mvp; + uint8_t *p_fref = m->p_fref[0]; + int bmx, bmy, bcost; + int omx, omy, pmx, pmy; + int i, j; + + int mv_x_min = h->mb.mv_min_fpel[0]; + int mv_y_min = h->mb.mv_min_fpel[1]; + int mv_x_max = h->mb.mv_max_fpel[0]; + int mv_y_max = h->mb.mv_max_fpel[1]; + + /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */ +// const uint64_t mvp_bak = *(uint64_t*)m->mvp; + if( h->mb.i_me_method == X264_ME_UMH ) + { + m->mvp[0] = x264_clip3( m->mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] ); + m->mvp[1] = x264_clip3( m->mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] ); + } + + if( m->i_pixel == PIXEL_16x16 ) + { + l->me16x8[0].cost = l->me16x8[1].cost = + l->me8x16[0].cost = l->me8x16[1].cost = + l->me8x8[0].cost = l->me8x8[1].cost = + l->me8x8[2].cost = l->me8x8[3].cost = COST_MAX; + + bcost = COST_MAX; + } + else + { + bcost = m->cost; + bmx = ( m->mv[0] + 2 ) >> 2; + bmy = ( m->mv[1] + 2 ) >> 2; + } + + pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max ); + pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max ); + + if( 
m->i_pixel == PIXEL_16x16 || *(uint64_t*)m->mvp != *(uint64_t*)l->me16x16.mvp ) + { + COST_MV( pmx, pmy ); + /* I don't know why this helps */ + bcost -= MVD( m->mvp, bmx, bmy ); + } + + /* try extra predictors if provided */ + for( i = 0; i < i_mvc; i++ ) + { + const int mx = x264_clip3( ( mvc[i][0] + 2 ) >> 2, mv_x_min, mv_x_max ); + const int my = x264_clip3( ( mvc[i][1] + 2 ) >> 2, mv_y_min, mv_y_max ); + if( mx != bmx || my != bmy ) + COST_MV( mx, my ); + } + + COST_MV( 0, 0 ); + + mv_x_max += 8; + mv_y_max += 8; + mv_x_min -= 8; + mv_y_min -= 8; + + switch( h->mb.i_me_method ) + { + case X264_ME_DIA: + /* diamond search, radius 1 */ +#define DIA1_ITER(mx, my)\ + {\ + omx = mx;\ + omy = my;\ + COST_MV( omx , omy-1 );\ + COST_MV( omx , omy+1 );\ + COST_MV( omx-1, omy );\ + COST_MV( omx+1, omy );\ + } + + for( i = 0; i < i_me_range; i++ ) + { + DIA1_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + break; + + case X264_ME_HEX: + /* hexagon search, radius 2 */ +#define HEX2_ITER(mx, my)\ + {\ + omx = mx;\ + omy = my;\ + COST_MV( omx-2, omy );\ + COST_MV( omx-1, omy+2 );\ + COST_MV( omx+1, omy+2 );\ + COST_MV( omx+2, omy );\ + COST_MV( omx+1, omy-2 );\ + COST_MV( omx-1, omy-2 );\ + } + + for( i = 0; i < i_me_range/2; i++ ) + { + HEX2_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + /* square refine */ + DIA1_ITER( bmx, bmy ); + COST_MV( omx-1, omy-1 ); + COST_MV( omx-1, omy+1 ); + COST_MV( omx+1, omy-1 ); + COST_MV( omx+1, omy+1 ); + break; + + case X264_ME_UMH: + /* Uneven-cross Multi-Hexagon-grid Search + * as in JM, except without early termination */ + + DIA1_ITER( pmx, pmy ); + if( pmx || pmy ) + DIA1_ITER( 0, 0 ); + DIA1_ITER( bmx, bmy ); + + /* cross */ + omx = bmx; omy = bmy; + for( i = 1; i < i_me_range; i+=2 ) + { + if( omx + i <= mv_x_max ) + COST_MV( omx + i, omy ); + if( omx - i >= mv_x_min ) + COST_MV( omx - i, omy ); + } + for( i = 1; i < i_me_range/2; i+=2 ) + { + if( omy + i <= mv_y_max ) + COST_MV( omx, omy + i ); 
+ if( omy - i >= mv_y_min ) + COST_MV( omx, omy - i ); + } + + /* 5x5 ESA */ + omx = bmx; omy = bmy; + for( i = 0; i < 24; i++ ) + { + static const int square2_x[24] = {1,1,0,-1,-1,-1, 0, 1, 2,2,2,2,1,0,-1,-2,-2,-2,-2,-2,-1, 0, 1, 2}; + static const int square2_y[24] = {0,1,1, 1, 0,-1,-1,-1,-1,0,1,2,2,2, 2, 2, 1, 0,-1,-2,-2,-2,-2,-2}; + COST_MV( omx + square2_x[i], omy + square2_y[i] ); + } + /* hexagon grid */ + omx = bmx; omy = bmy; + for( i = 1; i <= i_me_range/4; i++ ) + { + int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min ); + for( j = 0; j < 16; j++ ) + { + static const int hex4_x[16] = {0,-2,-4,-4,-4,-4,-4,-2, 0, 2, 4, 4,4,4,4,2}; + static const int hex4_y[16] = {4, 3, 2, 1, 0,-1,-2,-3,-4,-3,-2,-1,0,1,2,3}; + int mx = omx + hex4_x[j]*i; + int my = omy + hex4_y[j]*i; + if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max + && my >= mv_y_min && my <= mv_y_max ) ) + COST_MV( mx, my ); + } + } + + /* iterative search */ + for( i = 0; i < i_me_range; i++ ) + { + HEX2_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + for( i = 0; i < i_me_range; i++ ) + { + DIA1_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + break; + + case X264_ME_ESA: + { + const int min_x = X264_MAX( bmx - i_me_range, mv_x_min); + const int min_y = X264_MAX( bmy - i_me_range, mv_y_min); + const int max_x = X264_MIN( bmx + i_me_range, mv_x_max); + const int max_y = X264_MIN( bmy + i_me_range, mv_y_max); + for( omy = min_y; omy <= max_y; omy++ ) + for( omx = min_x; omx <= max_x; omx++ ) + { + COST_MV( omx, omy ); + } + } + break; + } + + /* -> qpel mv */ + m->mv[0] = bmx << 2; + m->mv[1] = bmy << 2; + + /* +#define SHIFT_QMV( m ) \ + l->m.mv[0] <<= 2; \ + l->m.mv[1] <<= 2; + + SHIFT_QMV( me16x8[0] ); SHIFT_QMV( me16x8[1] ); + SHIFT_QMV( me8x16[0] ); SHIFT_QMV( me8x16[1] ); + SHIFT_QMV( me8x8[0] ); SHIFT_QMV( me8x8[1] ); + SHIFT_QMV( me8x8[2] ); SHIFT_QMV( me8x8[3] ); +#undef SHIFT_QMV + */ + +// *(uint64_t*)m->mvp = 
mvp_bak; + + /* subpel refine */ + if( h->mb.i_subpel_refine >= 3 ) + { + int hpel, qpel; + + /* early termination (when examining multiple reference frames) + * FIXME: this can update fullpel_thresh even if the match + * ref is rejected after subpel refinement */ + if( p_fullpel_thresh ) + { + if( (m->cost*7)>>3 > *p_fullpel_thresh ) + return; + else if( m->cost < *p_fullpel_thresh ) + *p_fullpel_thresh = m->cost; + } + + hpel = subpel_iterations[h->mb.i_subpel_refine][2]; + qpel = subpel_iterations[h->mb.i_subpel_refine][3]; +#if 0 + refine_subpel_multipart( h, l, m, hpel, qpel ); +#else + m->cost_mv = MVD( mvp, bmx, bmy ); + m->cost = h->pixf.mbcmp[m->i_pixel]( m->p_fenc[0], m->i_stride[0], + &p_fref[bmy * m->i_stride[0] + bmx], m->i_stride[0] ) + + m->cost_mv; + if( h->mb.b_chroma_me ) + { + const int bw = x264_pixel_size[m->i_pixel].w; + const int bh = x264_pixel_size[m->i_pixel].h; + DECLARE_ALIGNED( uint8_t, pix[8*8*2], 16 ); + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + h->mc.mc_chroma( m->p_fref[5], m->i_stride[2], pix+8*8, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + m->cost += h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ) + + h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix+8*8, 8 ); + } + refine_subpel( h, m, hpel, qpel ); +#endif + } + else + { + /* compute the real cost */ + m->cost_mv = MVD( mvp, bmx, bmy ); + m->cost = h->pixf.mbcmp[m->i_pixel]( m->p_fenc[0], m->i_stride[0], + &p_fref[bmy * m->i_stride[0] + bmx], m->i_stride[0] ) + + m->cost_mv; + if( h->mb.b_chroma_me ) + { + const int bw = x264_pixel_size[m->i_pixel].w; + const int bh = x264_pixel_size[m->i_pixel].h; + DECLARE_ALIGNED( uint8_t, pix[8*8*2], 16 ); + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + h->mc.mc_chroma( m->p_fref[5], m->i_stride[2], pix+8*8, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + m->cost += h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 
) + + h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix+8*8, 8 ); + } + if( m->i_pixel == PIXEL_16x16 ) + { + l->me16x16.scost = + l->me16x8[0].scost = l->me16x8[1].scost = + l->me8x16[0].scost = l->me8x16[1].scost = + l->me8x8[0].scost = l->me8x8[1].scost = + l->me8x8[2].scost = l->me8x8[3].scost = COST_MAX; + //debug + l->me16x16.smv[0] = l->me16x16.smv[1] = + l->me16x8[0].smv[0] = l->me16x8[1].smv[0] = + l->me8x16[0].smv[0] = l->me8x16[1].smv[0] = + l->me8x8[0].smv[0] = l->me8x8[1].smv[0] = + l->me8x8[2].smv[0] = l->me8x8[3].smv[0] = + l->me16x8[0].smv[1] = l->me16x8[1].smv[1] = + l->me8x16[0].smv[1] = l->me8x16[1].smv[1] = + l->me8x8[0].smv[1] = l->me8x8[1].smv[1] = + l->me8x8[2].smv[1] = l->me8x8[3].smv[1] = 0; + } + } + +#if BENCH + if(sad16_calls > sad_print && sad8_calls > 0) + { + while(sad16_calls > sad_print) sad_print = sad_print * 3/2 + 1; + fprintf(stderr, "cached 16: 0.%03ld =%9ld /%9ld \n", + (sad16_calls-sad16_new)*1000/sad16_calls, sad16_calls-sad16_new, sad16_calls); + fprintf(stderr, "cached 8: 0.%03ld =%9ld /%9ld \n", + (sad8_calls-sad8_new)*1000/sad8_calls, sad8_calls-sad8_new, sad8_calls); + } +#endif +} +#undef COST_MV +#undef MVD + + +#define MVD( mvp, mx, my ) \ + p_cost_mv[ (mx) - (mvp)[0] ] + p_cost_mv[ (my) - (mvp)[1] ] + +/* optimization: no chroma_me for smaller partitions? 
*/ +static int cost_smv_multipart_16x16( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my, int bcost ) +{ + DECLARE_ALIGNED( uint8_t, pix[16*16], 16 ); + const int16_t *p_cost_mv = m->p_cost_mv; + int sad[4]; + int i8; + int stride = 16; + uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, 16, 16 ); + +#define x (i8&1) +#define y (i8>>1) + for( i8 = 0; i8 < 4; i8++ ) + sad[i8] = h->pixf.mbcmp[PIXEL_8x8]( &m->p_fenc[0][8*(x+y*m->i_stride[0])], m->i_stride[0], src+8*(x+y*stride), stride ); + if( h->mb.b_chroma_me ) + { + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, mx, my, 8, 8 ); + h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix+64, 8, mx, my, 8, 8 ); + for( i8 = 0; i8 < 4; i8++ ) + sad[i8] += h->pixf.mbcmp[PIXEL_4x4]( &m->p_fenc[1][4*(x+y*m->i_stride[1])], m->i_stride[1], pix +4*(x+y*8), 8 ) + + h->pixf.mbcmp[PIXEL_4x4]( &m->p_fenc[2][4*(x+y*m->i_stride[1])], m->i_stride[1], pix+64+4*(x+y*8), 8 ); + } +#undef x +#undef y + + for( i8 = 0; i8 < 4; i8++ ) + COPY3_IF_LT( l->me8x8[i8].scost, sad[i8] + MVD(l->me8x8[i8].mvp, mx, my), + l->me8x8[i8].smv[0], mx, l->me8x8[i8].smv[1], my ); + + COPY3_IF_LT( l->me16x8[0].scost, sad[0]+sad[1] + MVD(l->me16x8[0].mvp, mx, my), + l->me16x8[0].smv[0], mx, l->me16x8[0].smv[1], my ); + COPY3_IF_LT( l->me16x8[1].scost, sad[2]+sad[3] + MVD(l->me16x8[1].mvp, mx, my), + l->me16x8[1].smv[0], mx, l->me16x8[1].smv[1], my ); + COPY3_IF_LT( l->me8x16[0].scost, sad[0]+sad[2] + MVD(l->me8x16[0].mvp, mx, my), + l->me8x16[0].smv[0], mx, l->me8x16[0].smv[1], my ); + COPY3_IF_LT( l->me8x16[1].scost, sad[1]+sad[3] + MVD(l->me8x16[1].mvp, mx, my), + l->me8x16[1].smv[0], mx, l->me8x16[1].smv[1], my ); + + return sad[0]+sad[1]+sad[2]+sad[3] + MVD(m->mvp, mx, my); +} + +static int cost_smv_multipart_8x8( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my, int bcost ) +{ + DECLARE_ALIGNED( uint8_t, pix[16*16], 16 ); + const int16_t *p_cost_mv = m->p_cost_mv; + int sad; + int 
stride = 8; + const int i_pixel = m->i_pixel; + const int bw = x264_pixel_size[i_pixel].w; + const int bh = x264_pixel_size[i_pixel].h; + uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); + + sad = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) + + MVD(m->mvp, mx, my); + + if( h->mb.b_chroma_me && sad < bcost ) + { + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); + sad += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ); + if( sad < bcost ) + { + h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); + sad += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix, 8 ); + } + } + + return sad; +} + +static int (*const cost_smv_multipart[4])( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my, int bcost ) = +{ + cost_smv_multipart_16x16, cost_smv_multipart_8x8, cost_smv_multipart_8x8, cost_smv_multipart_8x8 +}; + +#define COST_MV( mx, my ) \ +{ \ + int cost = cost_smv_multipart[ m->i_pixel ]( h, l, m, mx, my, bcost ); \ + COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ +} + +static void refine_subpel_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int hpel_iters, int qpel_iters ) +{ + const int16_t *p_cost_mv = m->p_cost_mv; + int step, i; + int bmx, bmy, bcost; // FIXME gcc warning + + if( m->i_pixel == PIXEL_16x16 ) + { + l->me16x8[0].scost = l->me16x8[1].scost = + l->me8x16[0].scost = l->me8x16[1].scost = + l->me8x8[0].scost = l->me8x8[1].scost = + l->me8x8[2].scost = l->me8x8[3].scost = + bcost = COST_MAX; + } + else + { + bcost = COST_MAX; +// bcost = m->scost; +// bmx = m->smv[0]; +// bmy = m->smv[1]; + } + + COST_MV( m->mv[0], m->mv[1] ); + + if( hpel_iters ) + COST_MV( m->mvp[0], m->mvp[1] ); + + for( step = 2; step >= 1; step-- ) + { + for( i = step>1 ? 
hpel_iters : qpel_iters; i > 0; i-- )
+        {
+            int omx = bmx;
+            int omy = bmy;
+            COST_MV( omx, omy - step );
+            COST_MV( omx, omy + step );
+            COST_MV( omx - step, omy );
+            COST_MV( omx + step, omy );
+            if( bmx == omx && bmy == omy )
+                break;
+        }
+    }
+
+    m->cost = bcost;
+    m->mv[0] = bmx;
+    m->mv[1] = bmy;
+    m->cost_mv = MVD( m->mvp, bmx, bmy );
+}
+#undef COST_MV
+#undef MVD
+
Index: encoder/me.h
===================================================================
--- encoder/me.h	(revision 27)
+++ encoder/me.h	(working copy)
@@ -26,6 +26,22 @@
 
 #define COST_MAX (1<<28)
 
+#define CACHE_FPEL_MAX 0xffff
+#define CACHE_FPEL_RADIUS 32
+#define CACHE_FPEL_STRIDE (2*CACHE_FPEL_RADIUS)
+#define CACHE_FPEL_SIZE (4*CACHE_FPEL_RADIUS*CACHE_FPEL_RADIUS)
+#define CACHE_FPEL_OFFS(x,y) ((x)+CACHE_FPEL_RADIUS+((y)+CACHE_FPEL_RADIUS)*CACHE_FPEL_STRIDE)
+#if 0
+#define CACHE_FPEL_IN_RANGE(x,y) ((x) >= -CACHE_FPEL_RADIUS && \
+                                  (x) <  CACHE_FPEL_RADIUS && \
+                                  (y) >= -CACHE_FPEL_RADIUS && \
+                                  (y) <  CACHE_FPEL_RADIUS)
+#else
+// assumes radius is a power of 2; mask is -STRIDE (== ~(2*RADIUS-1)) because left-shifting a negative value is UB
+#define CACHE_FPEL_IN_RANGE(x,y) (!( ((x)+CACHE_FPEL_RADIUS) & (-CACHE_FPEL_STRIDE) \
+                                  || ((y)+CACHE_FPEL_RADIUS) & (-CACHE_FPEL_STRIDE) ))
+#endif
+
 typedef struct
 {
     /* input */
@@ -42,12 +58,17 @@
     int cost_mv; /* lambda * nbits for the chosen mv */
     int cost;    /* satd + lambda * nbits */
     int mv[2];
+
+    /* preliminary subpel analysis, piggybacking on larger block searches */
+    int scost;
+    int smv[2];
 } x264_me_t;
 
 typedef struct
 {
     /* 16x16 */
     int i_ref;
+    uint8_t *p_cache_fpel; /* bitmap of which mvs have been tried */
     x264_me_t me16x16;
 
     /* 8x8 */
@@ -79,6 +100,7 @@
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
 static inline void x264_me_search( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc )
     { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
+void x264_me_search_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int (*mvc)[2], int i_mvc, int
*p_fullpel_thresh ); void x264_me_refine_qpel( x264_t *h, x264_me_t *m ); Index: encoder/analyse.c =================================================================== --- encoder/analyse.c (revision 27) +++ encoder/analyse.c (working copy) @@ -156,6 +156,12 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) { + //FIXME move to common/macroblock.c + int i; + if( !h->mb.cache_fpel[0] ) + for( i = 0; i < 3; i++ ) + h->mb.cache_fpel[i] = x264_malloc( CACHE_FPEL_SIZE ); + memset( a, 0, sizeof( x264_mb_analysis_t ) ); /* conduct the analysis using this lamda and QP */ @@ -641,6 +647,124 @@ (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; +static void init_fenc( x264_t *h, x264_mb_analysis_t *a, x264_mb_analysis_list_t *l ) +{ + LOAD_FENC( &l->me16x16, h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me16x8[0], h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me16x8[1], h->mb.pic.p_fenc, 0, 8 ); + LOAD_FENC( &l->me8x16[0], h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me8x16[1], h->mb.pic.p_fenc, 8, 0 ); + LOAD_FENC( &l->me8x8[0], h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me8x8[1], h->mb.pic.p_fenc, 8, 0 ); + LOAD_FENC( &l->me8x8[2], h->mb.pic.p_fenc, 0, 8 ); + LOAD_FENC( &l->me8x8[3], h->mb.pic.p_fenc, 8, 8 ); + l->me16x16.i_pixel = PIXEL_16x16; + l->me16x8[0].i_pixel = l->me16x8[1].i_pixel = PIXEL_16x8; + l->me8x16[0].i_pixel = l->me8x16[1].i_pixel = PIXEL_8x16; + l->me8x8[0].i_pixel = l->me8x8[1].i_pixel = + l->me8x8[2].i_pixel = l->me8x8[3].i_pixel = PIXEL_8x8; + l->me16x16.p_cost_mv = + l->me16x8[0].p_cost_mv = l->me16x8[1].p_cost_mv = + l->me8x16[0].p_cost_mv = l->me8x16[1].p_cost_mv = + l->me8x8[0].p_cost_mv = l->me8x8[1].p_cost_mv = + l->me8x8[2].p_cost_mv = l->me8x8[3].p_cost_mv = a->p_cost_mv; +} + +static void init_mvps( x264_t *h, x264_mb_analysis_list_t *l, int i_ref ) +{ + int i; + l->i_ref = i_ref; + LOAD_HPELS( &l->me16x16, 
h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me16x8[0], h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me16x8[1], h->mb.pic.p_fref[0][i_ref], 0, 8 ); + LOAD_HPELS( &l->me8x16[0], h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me8x16[1], h->mb.pic.p_fref[0][i_ref], 8, 0 ); + LOAD_HPELS( &l->me8x8[0], h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me8x8[1], h->mb.pic.p_fref[0][i_ref], 8, 0 ); + LOAD_HPELS( &l->me8x8[2], h->mb.pic.p_fref[0][i_ref], 0, 8 ); + LOAD_HPELS( &l->me8x8[3], h->mb.pic.p_fref[0][i_ref], 8, 8 ); + + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, i_ref ); + x264_mb_predict_mv_16x16( h, 0, i_ref, l->me16x16.mvp ); + + x264_mb_predict_mv( h, 0, 0, 4, l->me16x8[0].mvp ); + x264_macroblock_cache_mv( h, 0, 0, 4, 2, 0, l->me16x8[0].mvp[0], l->me16x8[0].mvp[1] ); + x264_mb_predict_mv( h, 0, 8, 4, l->me16x8[1].mvp ); + + x264_mb_predict_mv( h, 0, 0, 2, l->me8x16[0].mvp ); + x264_macroblock_cache_mv( h, 0, 0, 2, 4, 0, l->me8x16[0].mvp[0], l->me8x16[0].mvp[1] ); + x264_mb_predict_mv( h, 0, 4, 2, l->me8x16[1].mvp ); + + for( i = 0; i < 4; i++ ) + { + int *mvp = l->me8x8[i].mvp; + x264_mb_predict_mv( h, 0, 4*i, 2, mvp ); + x264_macroblock_cache_mv( h, 2*(i&1), i&2, 2, 2, 0, mvp[0], mvp[1] ); + } +} + +static void x264_mb_analyse_inter_p16x16_multipart( x264_t *h, x264_mb_analysis_t *a ) +{ + x264_mb_analysis_list_t l; + int i_ref; + int mvc[4][2], i_mvc; + int i_fullpel_thresh = INT_MAX; + int *p_fullpel_thresh = h->i_ref0>1 ? 
&i_fullpel_thresh : NULL; + //FIXME cleanup + void *cache_best = h->mb.cache_fpel[0], *cache_cur = h->mb.cache_fpel[2], *cache_tmp; + + init_fenc( h, a, &l ); + + a->l0.me16x16.cost = INT_MAX; + for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) + { + const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref ); + i_fullpel_thresh -= i_ref_cost; + + memset( cache_cur, 0, CACHE_FPEL_SIZE ); + l.p_cache_fpel = cache_cur; + init_mvps( h, &l, i_ref ); + x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); + x264_me_search_multipart( h, &l, &l.me16x16, mvc, i_mvc, p_fullpel_thresh ); + + l.me16x16.cost += i_ref_cost; + i_fullpel_thresh += i_ref_cost; + + if( l.me16x16.cost < a->l0.me16x16.cost ) + { + a->l0 = l; + cache_tmp = cache_cur; + cache_cur = cache_best; + cache_best = cache_tmp; + } + + /* save mv for predicting neighbors */ + h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = l.me16x16.mv[0]; + h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = l.me16x16.mv[1]; + } + + /* Set global ref, needed for all others modes */ + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); +// fprintf(stderr, "\np16x16 r%d: (%d,%d)\n", a->l0.i_ref, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1]); + + h->mb.cache_fpel[0] = cache_best; + h->mb.cache_fpel[2] = cache_cur; + + if( a->b_mbrd ) + { + a->i_best_satd = a->l0.me16x16.cost; + h->mb.i_type = P_L0; + h->mb.i_partition = D_16x16; + x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] ); + a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 ); + } + else + { + /* subtract ref cost, so we don't have to add it for the other P types */ + a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); + } +} + static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { x264_me_t m; @@ -708,6 +832,7 @@ /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; + /* FIXME shouldn't need predictors from large block sizes */ i_mvc 
= 1; mvc[0][0] = a->l0.me16x16.mv[0]; mvc[0][1] = a->l0.me16x16.mv[1]; @@ -718,6 +843,9 @@ const int x8 = i%2; const int y8 = i/2; +// fprintf(stderr, "p8x8[%d]: (%d,%d) -> (%d,%d) -> (%d,%d) ->", +// i, m->mvp[0], m->mvp[1], m->mv[0], m->mv[1], m->smv[0], m->smv[1]); + m->i_pixel = PIXEL_8x8; m->p_cost_mv = a->p_cost_mv; @@ -725,8 +853,10 @@ LOAD_HPELS( m, p_fref, 8*x8, 8*y8 ); x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp ); - x264_me_search( h, m, mvc, i_mvc ); + x264_me_search_multipart( h, &a->l0, m, mvc, i_mvc, NULL ); + //x264_me_search( h, m, mvc, i_mvc ); +// fprintf(stderr, " (%d,%d)\n", m->mv[0], m->mv[1]); x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] ); mvc[i_mvc][0] = m->mv[0]; @@ -764,6 +894,9 @@ { x264_me_t *m = &a->l0.me16x8[i]; +// fprintf(stderr, "p16x8[%d]: (%d,%d) -> (%d,%d) -> (%d,%d) ->", +// i, m->mvp[0], m->mvp[1], m->mv[0], m->mv[1], m->smv[0], m->smv[1]); + m->i_pixel = PIXEL_16x8; m->p_cost_mv = a->p_cost_mv; @@ -776,8 +909,10 @@ mvc[1][1] = a->l0.me8x8[2*i+1].mv[1]; x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp ); - x264_me_search( h, m, mvc, 2 ); + x264_me_search_multipart( h, &a->l0, m, NULL, 0, NULL ); + //x264_me_search( h, m, mvc, 2 ); +// fprintf(stderr, " (%d,%d)\n", m->mv[0], m->mv[1]); x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] ); } @@ -805,6 +940,9 @@ { x264_me_t *m = &a->l0.me8x16[i]; +// fprintf(stderr, "p8x16[%d]: (%d,%d) -> (%d,%d) -> (%d,%d) ->", +// i, m->mvp[0], m->mvp[1], m->mv[0], m->mv[1], m->smv[0], m->smv[1]); + m->i_pixel = PIXEL_8x16; m->p_cost_mv = a->p_cost_mv; @@ -817,8 +955,10 @@ mvc[1][1] = a->l0.me8x8[i+2].mv[1]; x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp ); - x264_me_search( h, m, mvc, 2 ); + x264_me_search_multipart( h, &a->l0, m, NULL, 0, NULL ); + //x264_me_search( h, m, mvc, 2 ); +// fprintf(stderr, " (%d,%d)\n", m->mv[0], m->mv[1]); x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] ); } @@ -1547,9 +1687,16 @@ x264_mb_analyse_load_costs( h, &analysis ); - 
x264_mb_analyse_inter_p16x16( h, &analysis );
     if( flags & X264_ANALYSE_PSUB16x16 )
+    {
+        x264_mb_analyse_inter_p16x16_multipart( h, &analysis );
+        //x264_mb_analyse_inter_p16x16( h, &analysis );
+        //x264_mb_analyse_inter_p16x8( h, &analysis );
+        //x264_mb_analyse_inter_p8x16( h, &analysis );
         x264_mb_analyse_inter_p8x8( h, &analysis );
+    }
+    else
+        x264_mb_analyse_inter_p16x16( h, &analysis );
 
     /* Select best inter mode */
     i_type = P_L0;
Index: common/common.h
===================================================================
--- common/common.h	(revision 27)
+++ common/common.h	(working copy)
@@ -357,6 +357,9 @@
     int8_t  *skipbp;            /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
     int8_t  *mb_transform_size; /* transform_size_8x8_flag of each mb */
 
+    /* motion estimation state */
+    uint8_t *cache_fpel[3];     /* bitmap of fullpel mvs already tried (CACHE_FPEL_SIZE bytes each). [L0, L1, tmp] */
+
     /* current value */
     int     i_type;
     int     i_partition;