Index: encoder/analyse.h =================================================================== --- encoder/analyse.h (revision 157) +++ encoder/analyse.h (working copy) @@ -26,5 +26,6 @@ void x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); +void x264_weighted_pred_init( x264_t *h, int i_slice_type ); #endif Index: encoder/macroblock.c =================================================================== --- encoder/macroblock.c (revision 157) +++ encoder/macroblock.c (working copy) @@ -507,34 +507,6 @@ } /***************************************************************************** - * x264_macroblock_encode_pskip: - * Encode an already marked skip block - *****************************************************************************/ -void x264_macroblock_encode_pskip( x264_t *h ) -{ - const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0], - h->mb.mv_min[0], h->mb.mv_max[0] ); - const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1], - h->mb.mv_min[1], h->mb.mv_max[1] ); - - /* Motion compensation XXX probably unneeded */ - h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], - h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], - mvx, mvy, 16, 16 ); - - /* Chroma MC */ - h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], - h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1], - mvx, mvy, 8, 8 ); - - h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], - h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2], - mvx, mvy, 8, 8 ); - - x264_macroblock_encode_skip( h ); -} - -/***************************************************************************** * x264_macroblock_encode: *****************************************************************************/ void x264_macroblock_encode( x264_t *h ) @@ -545,13 +517,13 @@ if( h->mb.i_type == P_SKIP ) { - /* A bit special */ - x264_macroblock_encode_pskip( h ); + /* XXX motion compensation is probably unneeded: it was done during analysis */ + x264_mb_mc_pskip( h ); + 
x264_macroblock_encode_skip( h ); return; } if( h->mb.i_type == B_SKIP ) { - /* XXX motion compensation is probably unneeded */ x264_mb_mc( h ); x264_macroblock_encode_skip( h ); return; @@ -806,6 +778,15 @@ h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], mvp[0], mvp[1], 16, 16 ); + + /* Weighted prediction */ + if( h->sh.b_wpred[0][0] ) + { + h->pixf.weight[ PIXEL_16x16 ]( + h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], + h->sh.i_wpred_log2_denom[0], h->sh.i_wpred_scale[0][0], + h->sh.i_wpred_offset[0][0] ); + } } /* get luma diff */ Index: encoder/slicetype_decision.c =================================================================== --- encoder/slicetype_decision.c (revision 157) +++ encoder/slicetype_decision.c (working copy) @@ -28,10 +28,15 @@ #include "common/common.h" #include "common/macroblock.h" #include "common/cpu.h" +#include "common/clip1.h" #include "macroblock.h" #include "me.h" +/**************************************************************************** + * B-frame placement functions + ****************************************************************************/ + static void x264_mb_analyse_load_costs_lowres( x264_t *h, x264_mb_analysis_t *a ) { static int16_t *p_cost_mv; @@ -405,3 +410,181 @@ frm->i_type = X264_TYPE_B; } } + + +/**************************************************************************** + * Weighted prediction functions + ****************************************************************************/ + +static void weight_analyse_plane( x264_t *h, uint8_t *pix, int i_pix_stride, uint8_t *ref, int i_ref_stride, + int i_width, int i_height, int *i_log2_denom, + int *wpred, int *wpred_scale, int *wpred_offset ) +{ + int x, y; + int64_t S0=0, S1=0, S00=0, S11=0, S01=0; + int n = i_width*i_height; + float scale, offset, mult; + float err_old, err_affine, err_mult, err_quant; + + for( y = 0; y < i_height; y++ ) + { + for( x = 0; x < i_width; x++ ) + { + const uint8_t e0 = 
ref[y*i_ref_stride+x]; + const uint8_t e1 = pix[y*i_pix_stride+x]; /* pix is walked with its own stride, not the reference's */ + S0 += e0; + S1 += e1; + S00 += e0*e0; + S01 += e0*e1; + S11 += e1*e1; + } + } + + x264_cpu_restore( h->param.cpu ); + + if( S00 == 0 || S0*S0 - S00*n == 0 || S11 + S00 - 2*S01 == 0 ) + return; /* each tested quantity is used as a divisor below; bail out without writing any output */ + + /* calculate scale and offset to minimize SSD */ + scale = (float)(S0*S1 - S01*n) / (S0*S0 - S00*n); + offset = (float)(S1 - scale*S0) / n; + err_old = (float)(S11 + S00 - 2*S01) / n; + err_affine = (S11 + scale*scale*S00 + offset*offset*n + 2*scale*offset*S0 - 2*scale*S01 - 2*offset*S1) / n; + + mult = (float)S01/S00; + err_mult = (S00*mult*mult - 2*S01*mult + S11) / n; + +// fprintf( stderr, "S0=%lld S1=%lld S00=%lld S01=%lld S11=%lld n=%d \n", +// S0, S1, S00, S01, S11, n ); +// fprintf( stderr, "mul=%.5f affine=*%.5f%+.5f Eold=%.5f Enew=%.5f Emul=%.5f rat=%.5f \n", +// mult, scale, offset, err_old, err_affine, err_mult, err_affine/err_old ); + +#if 0 + if( (mult > 1.03 || mult < 0.98) + && mult > 0.5 && mult < 2.0 + && err_mult/err_old < 0.5 + && err_old > 1.0 + && err_mult < 200.0 ) + { + /* FIXME: chroma planes share a log2_denom */ + if( mult < 1. ) + *i_log2_denom = 7; + else + *i_log2_denom = 6; + + *wpred = 1; + *wpred_scale = (int)(mult * (1<<*i_log2_denom) + .5); + *wpred_offset = 0; + + // + float multq = (float)*wpred_scale / (1<<*i_log2_denom); + err_quant = (S00*multq*multq - 2*S01*multq + S11) / n; + + fprintf( stderr, "wpred=1 trans = %d/%d %c %d, Eold=%.4f Emul=%.4f Equant=%.4f \n", + *wpred_scale, 1<<*i_log2_denom, *wpred_offset>0 ? '+' : '-', abs(*wpred_offset), + err_old, err_mult, err_quant ); + } +#else + if( (scale > 1.03 || scale < 0.98 || offset > 2 || offset < -2) /* only weight when the fit differs noticeably from identity */ + && scale > 0.5 && scale < 2.0 + && err_affine/err_old < 0.5 + && err_old > 1.0 + && err_affine < 200.0 ) + { + /* FIXME: chroma planes share a log2_denom */ + if( scale < 1. 
) + *i_log2_denom = 7; + else + *i_log2_denom = 6; + + *wpred = 1; + *wpred_scale = (int)(scale * (1<<*i_log2_denom) + .5); + *wpred_offset = (int)(offset < 0 ? offset - .5 : offset + .5); /* round to nearest; a bare +.5 followed by the int cast truncates negative offsets toward zero */ + + // + float multq = (float)*wpred_scale / (1<<*i_log2_denom); + float addq = (float)*wpred_offset; + err_quant = (S11 + multq*multq*S00 + addq*addq*n + 2*multq*addq*S0 - 2*multq*S01 - 2*addq*S1) / n; + + fprintf( stderr, "wpred=1 trans = %d/%d %c %d, Eold=%.4f Enew=%.4f Equant=%.4f \n", + *wpred_scale, 1<<*i_log2_denom, *wpred_offset>0 ? '+' : '-', abs(*wpred_offset), + err_old, err_affine, err_quant ); + } +#endif + else + *i_log2_denom = 0; +} + +static void weight_scale_plane( uint8_t *pix, int i_pix_stride, uint8_t *ref, int i_ref_stride, + int i_width, int i_height, int i_log2_denom, + int wpred_scale, int wpred_offset ) +{ + uint8_t transform[256]; /* lookup table: weighted and clipped result for every possible 8-bit sample */ + int x, y, i; + + for( i = 0; i < 256; i++ ) + { + int v; + if( i_log2_denom ) + v = ((i * wpred_scale + (1 << (i_log2_denom-1))) >> i_log2_denom) + wpred_offset; + else + v = i * wpred_scale + wpred_offset; + transform[i] = x264_clip_uint8( v ); + } + + for( y = 0; y < i_height; y++ ) + for( x = 0; x < i_width; x++ ) + pix[y*i_pix_stride+x] = transform[ ref[y*i_ref_stride+x] ]; +} + +void x264_weighted_pred_init( x264_t *h, int i_slice_type ) +{ + int i_ref; + int i, c; + + /* the rest of the refs have wpred initted to 0 and never changed */ + for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) + for( i = 0; i < 4; i++ ) + h->fref0[i_ref]->weighted[i] = h->fref0[i_ref]->filtered[i]; + for( i_ref = 0; i_ref < h->i_ref1; i_ref++ ) + for( i = 0; i < 4; i++ ) + h->fref1[i_ref]->weighted[i] = h->fref1[i_ref]->filtered[i]; + memset( h->sh.b_wpred, 0, sizeof(h->sh.b_wpred) ); + + if( i_slice_type != SLICE_TYPE_P ) + return; + + /* calculate weights for ref0 */ + for( c = 0; c < 1 /*3*/; c++ ) + { + int *log_denom = &h->sh.i_wpred_log2_denom[c]; + int *wpred = &h->sh.b_wpred[0][c]; + int *scale = &h->sh.i_wpred_scale[0][c]; + int *offset = 
&h->sh.i_wpred_offset[0][c]; + *log_denom = 6; + *wpred = 0; + *scale = 1<<6; + *offset = 0; + + weight_analyse_plane( h, h->fenc->plane[c], h->fenc->i_stride[c], + h->fref0[0]->plane[c], h->fref0[0]->i_stride[c], + h->param.i_width >> !!c, h->param.i_height >> !!c, + log_denom, wpred, scale, offset ); + } + + /* filter the luma plane for motion estimation */ + if( h->sh.b_wpred[0][0] ) + { + for( i = 0; i < 4; i++ ) + { + h->fref0[0]->weighted[i] = h->mb.p_weight_buf[i] + h->fref0[0]->i_stride[0] * 32 + 32; + + weight_scale_plane( h->fref0[0]->weighted[i], h->fref0[0]->i_stride[0], /* luma stride: c has already run past the loop above and must not be used as an index here */ + h->fref0[0]->filtered[i], h->fref0[0]->i_stride[0], + h->param.i_width, h->param.i_height, /* NOTE(review): only the visible width x height area is written; the padding around it in p_weight_buf is not filled here, confirm nothing reads it */ + h->sh.i_wpred_log2_denom[0], + h->sh.i_wpred_scale[0][0], h->sh.i_wpred_offset[0][0] ); + } + } +} + Index: encoder/encoder.c =================================================================== --- encoder/encoder.c (revision 157) +++ encoder/encoder.c (working copy) @@ -277,9 +277,34 @@ } } - if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) || - ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) ) + if( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) { + bs_write_ue( s, sh->i_wpred_log2_denom[0] ); + bs_write_ue( s, sh->i_wpred_log2_denom[1] ); + for( i = 0; i < sh->i_num_ref_idx_l0_active; i++ ) + { + int luma_weight_l0_flag = sh->b_wpred[i][0]; + int chroma_weight_l0_flag = sh->b_wpred[i][1] || sh->b_wpred[i][2]; + bs_write1( s, luma_weight_l0_flag ); + if( luma_weight_l0_flag ) + { + bs_write_se( s, sh->i_wpred_scale[i][0] ); + bs_write_se( s, sh->i_wpred_offset[i][0] ); + } + bs_write1( s, chroma_weight_l0_flag ); + if( chroma_weight_l0_flag ) + { + int j; + for( j = 1; j < 3; j++ ) + { + bs_write_se( s, sh->i_wpred_scale[i][j] ); + bs_write_se( s, sh->i_wpred_offset[i][j] ); + } + } + } + } + else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) + { /* FIXME */ } 
@@ -340,6 +365,8 @@ x264_t *h = x264_malloc( sizeof( x264_t ) ); int i, i_slice; + memset( h, 0, sizeof( x264_t ) ); + /* Create a copy of param */ memcpy( &h->param, param, sizeof( x264_param_t ) ); if( h->param.rc.psz_stat_out ) @@ -1127,6 +1154,8 @@ /* build ref list 0/1 */ x264_reference_build_list( h, h->fdec->i_poc, i_slice_type ); + if( h->param.analyse.b_weighted_pred ) + x264_weighted_pred_init( h, i_slice_type ); if( i_slice_type == SLICE_TYPE_B ) x264_macroblock_bipred_init( h ); Index: encoder/set.c =================================================================== --- encoder/set.c (revision 157) +++ encoder/set.c (working copy) @@ -293,7 +293,7 @@ pps->i_num_ref_idx_l0_active = 1; pps->i_num_ref_idx_l1_active = 1; - pps->b_weighted_pred = 0; + pps->b_weighted_pred = param->analyse.b_weighted_pred ? 1 : 0; pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0; pps->i_pic_init_qp = 26; Index: encoder/analyse.c =================================================================== --- encoder/analyse.c (revision 157) +++ encoder/analyse.c (working copy) @@ -541,7 +541,7 @@ i_fullpel_thresh -= i_ref_cost; /* search with ref */ - LOAD_HPELS( m.p_fref, h->mb.pic.p_fref[0][i_ref], 0 ); + LOAD_HPELS( m.p_fref, h->mb.pic.p_fref_w[0][i_ref], 0 ); x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh ); @@ -569,7 +569,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref]; + uint8_t **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref]; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; int mvc[5][2], i_mvc; int i; @@ -613,7 +613,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref]; + uint8_t **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref]; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; int mvc[2][2]; int i; @@ 
-648,7 +648,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref]; + uint8_t **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref]; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; int mvc[2][2]; int i; @@ -683,7 +683,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { - uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref]; + uint8_t **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref]; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; int i4x4; @@ -722,7 +722,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { - uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref]; + uint8_t **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref]; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; int i8x4; @@ -758,7 +758,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { - uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref]; + uint8_t **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref]; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; int i4x8; @@ -1008,20 +1008,23 @@ } #undef CACHE_MV_BI +#define INIT_FREF \ + uint8_t *p_fref[2][4] = \ + { { \ + h->mb.pic.p_fref[0][a->l0.i_ref][0], \ + h->mb.pic.p_fref[0][a->l0.i_ref][1], \ + h->mb.pic.p_fref[0][a->l0.i_ref][2], \ + h->mb.pic.p_fref[0][a->l0.i_ref][3] \ + }, { \ + h->mb.pic.p_fref[1][a->l1.i_ref][0], \ + h->mb.pic.p_fref[1][a->l1.i_ref][1], \ + h->mb.pic.p_fref[1][a->l1.i_ref][2], \ + h->mb.pic.p_fref[1][a->l1.i_ref][3] \ + } }; + static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t *p_fref[2][4] = - { { - h->mb.pic.p_fref[0][a->l0.i_ref][0], - h->mb.pic.p_fref[0][a->l0.i_ref][1], - h->mb.pic.p_fref[0][a->l0.i_ref][2], - h->mb.pic.p_fref[0][a->l0.i_ref][3] - }, { - h->mb.pic.p_fref[1][a->l1.i_ref][0], - h->mb.pic.p_fref[1][a->l1.i_ref][1], - h->mb.pic.p_fref[1][a->l1.i_ref][2], - h->mb.pic.p_fref[1][a->l1.i_ref][3] - } }; + INIT_FREF; uint8_t *p_fenc = 
h->mb.pic.p_fenc[0]; uint8_t pix[2][8*8]; int i, l; @@ -1098,18 +1101,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t *p_fref[2][4] = - { { - h->mb.pic.p_fref[0][a->l0.i_ref][0], - h->mb.pic.p_fref[0][a->l0.i_ref][1], - h->mb.pic.p_fref[0][a->l0.i_ref][2], - h->mb.pic.p_fref[0][a->l0.i_ref][3] - }, { - h->mb.pic.p_fref[1][a->l1.i_ref][0], - h->mb.pic.p_fref[1][a->l1.i_ref][1], - h->mb.pic.p_fref[1][a->l1.i_ref][2], - h->mb.pic.p_fref[1][a->l1.i_ref][3] - } }; + INIT_FREF; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; uint8_t pix[2][16*8]; int i_ref_stride = h->mb.pic.i_stride[0]; @@ -1182,18 +1174,7 @@ } static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t *p_fref[2][4] = - { { - h->mb.pic.p_fref[0][a->l0.i_ref][0], - h->mb.pic.p_fref[0][a->l0.i_ref][1], - h->mb.pic.p_fref[0][a->l0.i_ref][2], - h->mb.pic.p_fref[0][a->l0.i_ref][3] - }, { - h->mb.pic.p_fref[1][a->l1.i_ref][0], - h->mb.pic.p_fref[1][a->l1.i_ref][1], - h->mb.pic.p_fref[1][a->l1.i_ref][2], - h->mb.pic.p_fref[1][a->l1.i_ref][3] - } }; + INIT_FREF; uint8_t *p_fenc = h->mb.pic.p_fenc[0]; uint8_t pix[2][8*16]; int i_ref_stride = h->mb.pic.i_stride[0]; Index: x264.c =================================================================== --- x264.c (revision 157) +++ x264.c (working copy) @@ -141,7 +141,8 @@ " - none, all\n" " --direct Direct MV prediction mode [\"temporal\"]\n" " - none, spatial, temporal\n" - " -w, --weightb Weighted prediction for B-frames\n" + " --weightb Weighted prediction for B-frames\n" + " --weightp Weighted prediction for P-frames\n" " -m, --subme Subpixel motion estimation quality: 1=fast, 5=best. 
[%d]\n" "\n" " --level Specify IDC level\n" @@ -224,6 +225,8 @@ #define OPT_NOBADAPT 277 #define OPT_BBIAS 278 #define OPT_BPYRAMID 279 +#define OPT_WEIGHTP 280 +#define OPT_WEIGHTB 281 static struct option long_options[] = { @@ -251,7 +254,8 @@ { "output", required_argument, NULL, 'o' }, { "analyse", required_argument, NULL, 'A' }, { "direct", required_argument, NULL, OPT_DIRECT }, - { "weightb", no_argument, NULL, 'w' }, + { "weightp", no_argument, NULL, OPT_WEIGHTP }, + { "weightb", no_argument, NULL, OPT_WEIGHTB }, { "subme", required_argument, NULL, 'm' }, { "level", required_argument, NULL, OPT_LEVEL }, { "rcsens", required_argument, NULL, OPT_RCSENS }, @@ -273,7 +277,7 @@ int c; - c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:m:p:vw", + c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:m:p:v", long_options, &long_options_index); if( c == -1 ) @@ -407,7 +411,10 @@ else param->analyse.i_direct_mv_pred = atoi( optarg ); break; - case 'w': + case OPT_WEIGHTP: + param->analyse.b_weighted_pred = 1; + break; + case OPT_WEIGHTB: param->analyse.b_weighted_bipred = 1; break; case 'm': Index: common/frame.h =================================================================== --- common/frame.h (revision 157) +++ common/frame.h (working copy) @@ -43,6 +43,7 @@ int i_lines_lowres; uint8_t *plane[4]; uint8_t *filtered[4]; /* plane[0], H, V, HV */ + uint8_t *weighted[4]; /* 0, H, V, HV */ uint8_t *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */ /* for unrestricted mv we allocate more data than needed Index: common/macroblock.c =================================================================== --- common/macroblock.c (revision 157) +++ common/macroblock.c (working copy) @@ -567,9 +567,16 @@ const int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ); h->mc.mc_luma( h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], - &h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], + 
&h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height ); + if( h->sh.b_wpred[i_ref][0] ) + { + h->pixf.weight[ x264_size2pixel[height][width] ]( + &h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], + h->sh.i_wpred_log2_denom[0], h->sh.i_wpred_scale[i_ref][0], h->sh.i_wpred_offset[i_ref][0] ); + } + h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); @@ -820,6 +827,34 @@ } } +void x264_mb_mc_pskip( x264_t *h ) +{ + const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0], + h->mb.mv_min[0], h->mb.mv_max[0] ); + const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1], + h->mb.mv_min[1], h->mb.mv_max[1] ); + + h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], + h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], + mvx, mvy, 16, 16 ); + + if( h->sh.b_wpred[0][0] ) + { + h->pixf.weight[ PIXEL_16x16 ]( + h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], + h->sh.i_wpred_log2_denom[0], h->sh.i_wpred_scale[0][0], + h->sh.i_wpred_offset[0][0] ); + } + + h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], + h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1], + mvx, mvy, 8, 8 ); + + h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], + h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2], + mvx, mvy, 8, 8 ); +} + void x264_macroblock_cache_init( x264_t *h ) { int i, j; @@ -937,25 +972,33 @@ /* load picture pointers */ for( i = 0; i < 3; i++ ) { - const int w = (i == 0 ? 16 : 8); const int i_stride = h->fdec->i_stride[i]; + const int i_xy = (i == 0 ? 
16 : 8) * ( i_mb_x + i_mb_y * i_stride ); int j; h->mb.pic.i_stride[i] = i_stride; - h->mb.pic.p_fenc[i] = &h->fenc->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )]; + h->mb.pic.p_fenc[i] = &h->fenc->plane[i][i_xy]; + h->mb.pic.p_fdec[i] = &h->fdec->plane[i][i_xy]; - h->mb.pic.p_fdec[i] = &h->fdec->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )]; - for( j = 0; j < h->i_ref0; j++ ) + h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &h->fref0[j]->plane[i][i_xy]; + for( j = 0; j < h->i_ref1; j++ ) + h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &h->fref1[j]->plane[i][i_xy]; + } + for( i = 0; i < 4; i++ ) + { + const int i_xy = 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] ); + int j; + for( j = 0; j < h->i_ref0; j++ ) { - h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &h->fref0[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )]; - h->mb.pic.p_fref[0][j][i+1] = &h->fref0[j]->filtered[i+1][ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )]; + h->mb.pic.p_fref[0][j][i] = &h->fref0[j]->filtered[i][i_xy]; + h->mb.pic.p_fref_w[0][j][i] = &h->fref0[j]->weighted[i][i_xy]; } for( j = 0; j < h->i_ref1; j++ ) { - h->mb.pic.p_fref[1][j][i==0 ? 
0:i+3] = &h->fref1[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )]; - h->mb.pic.p_fref[1][j][i+1] = &h->fref1[j]->filtered[i+1][ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )]; + h->mb.pic.p_fref[1][j][i] = &h->fref1[j]->filtered[i][i_xy]; + h->mb.pic.p_fref_w[1][j][i] = &h->fref1[j]->weighted[i][i_xy]; } } Index: common/macroblock.h =================================================================== --- common/macroblock.h (revision 157) +++ common/macroblock.h (working copy) @@ -195,6 +195,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale ); void x264_mb_mc( x264_t *h ); +void x264_mb_mc_pskip( x264_t *h ); static inline void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int ref ) Index: common/pixel.c =================================================================== --- common/pixel.c (revision 157) +++ common/pixel.c (working copy) @@ -180,12 +180,42 @@ PIXEL_AVG_C( pixel_avg_2x2, 2, 2 ) -/* Implicit weighted bipred only: +/* Explicit weighted pred (unidirectional only) + * FIXME: op_scale1 could be memoized */ +#define op_scale1(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight2 + i_offset2) >> 7 ) +static inline void pixel_weight_wxh( uint8_t *dst, int i_dst, int width, int height, int i_log2_denom, int i_weight, int i_offset ){ + const int i_weight2 = i_weight << (7-i_log2_denom); + const int i_offset2 = (i_offset << 7) + (1 << 6); + int y; + for(y=0; y> 6 ) static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){ + const int i_weight2 = 64 - i_weight1; int y; - const int i_weight2 = 64 - i_weight1; for(y=0; yavg_weight[PIXEL_4x2] = pixel_avg_weight_4x2; pixf->avg_weight[PIXEL_2x4] = pixel_avg_weight_2x4; pixf->avg_weight[PIXEL_2x2] = pixel_avg_weight_2x2; + + pixf->weight[PIXEL_16x16]= pixel_weight_16x16; + pixf->weight[PIXEL_16x8] = pixel_weight_16x8; + pixf->weight[PIXEL_8x16] = pixel_weight_8x16; + pixf->weight[PIXEL_8x8] = 
pixel_weight_8x8; + pixf->weight[PIXEL_8x4] = pixel_weight_8x4; + pixf->weight[PIXEL_4x8] = pixel_weight_4x8; + pixf->weight[PIXEL_4x4] = pixel_weight_4x4; + pixf->weight[PIXEL_4x2] = pixel_weight_4x2; + pixf->weight[PIXEL_2x4] = pixel_weight_2x4; + pixf->weight[PIXEL_2x2] = pixel_weight_2x2; #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMXEXT ) Index: common/pixel.h =================================================================== --- common/pixel.h (revision 157) +++ common/pixel.h (working copy) @@ -28,6 +28,7 @@ typedef int (*x264_pixel_satd_t)( uint8_t *, int, uint8_t *, int ); typedef void (*x264_pixel_avg_t) ( uint8_t *, int, uint8_t *, int ); typedef void (*x264_pixel_avg_weight_t) ( uint8_t *, int, uint8_t *, int, int ); +typedef void (*x264_pixel_weight_t) ( uint8_t *, int, int, int, int ); enum { @@ -68,6 +69,7 @@ x264_pixel_satd_t satd[7]; x264_pixel_avg_t avg[10]; x264_pixel_avg_weight_t avg_weight[10]; + x264_pixel_weight_t weight[10]; } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); Index: common/frame.c =================================================================== --- common/frame.c (revision 157) +++ common/frame.c (working copy) @@ -76,6 +76,17 @@ frame->i_stride[0] * 32 + 32; } + for( i = 0; i < 4; i++ ) + frame->weighted[i] = frame->filtered[i]; + + /* FIXME move somewhere */ + if( !h->mb.p_weight_buf[0] ) + { + for( i = 0; i < 4; i++ ) + h->mb.p_weight_buf[i] = x264_malloc( frame->i_stride[0] * + ( frame->i_lines[0] + 64 ) ); + } + frame->i_stride_lowres = frame->i_stride[0]/2 + 32; frame->i_lines_lowres = frame->i_lines[0]/2; for( i = 0; i < 4; i++ ) Index: common/common.h =================================================================== --- common/common.h (revision 157) +++ common/common.h (working copy) @@ -147,6 +147,11 @@ int arg; } ref_pic_list_order[2][16]; + int b_wpred[16][3]; + int i_wpred_log2_denom[3]; + int i_wpred_scale[16][3]; + int i_wpred_offset[16][3]; + int i_cabac_init_idc; int 
i_qp_delta; @@ -345,7 +350,8 @@ uint8_t *p_fdec[3]; /* pointer over mb of the references */ - uint8_t *p_fref[2][16][4+2]; /* last: lN, lH, lV, lHV, cU, cV */ + uint8_t *p_fref[2][16][4+2]; /* last: l0, lH, lV, lHV, cU, cV */ + uint8_t *p_fref_w[2][16][4]; /* l0, lH, lV, lHV */ /* common stride */ int i_stride[3]; @@ -379,9 +385,12 @@ int i_last_dqp; /* last delta qp */ int b_variable_qp; /* whether qp is allowed to vary per macroblock */ - /* B_direct and weighted prediction */ + /* weighted prediction */ + int bipred_weight[16][16]; + uint8_t *p_weight_buf[4]; + + /* B_direct */ int dist_scale_factor[16][16]; - int bipred_weight[16][16]; /* maps fref1[0]'s ref indices into the current list0 */ int map_col_to_list0_buf[2]; // for negative indices int map_col_to_list0[16]; Index: x264.h =================================================================== --- x264.h (revision 157) +++ x264.h (working copy) @@ -143,7 +143,8 @@ int i_subpel_refine; /* subpixel motion estimation quality */ int i_mv_range; /* maximum length of a mv (in pixels) */ - int b_weighted_bipred; /* implicit weighting for B-frames */ + int b_weighted_pred; /* weighted prediction for P-frames */ + int b_weighted_bipred; /* implicit weighted prediction for B-frames */ int b_psnr; /* Do we compute PSNR stats (save a few % of cpu) */ } analyse;