Index: encoder/analyse.h
===================================================================
--- encoder/analyse.h	(revision 157)
+++ encoder/analyse.h	(working copy)
@@ -26,5 +26,6 @@
 
 void x264_macroblock_analyse( x264_t *h );
 void x264_slicetype_decide( x264_t *h );
+void x264_weighted_pred_init( x264_t *h, int i_slice_type );
 
 #endif
Index: encoder/macroblock.c
===================================================================
--- encoder/macroblock.c	(revision 157)
+++ encoder/macroblock.c	(working copy)
@@ -507,34 +507,6 @@
 }
 
 /*****************************************************************************
- * x264_macroblock_encode_pskip:
- *  Encode an already marked skip block
- *****************************************************************************/
-void x264_macroblock_encode_pskip( x264_t *h )
-{
-    const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
-                                h->mb.mv_min[0], h->mb.mv_max[0] );
-    const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
-                                h->mb.mv_min[1], h->mb.mv_max[1] );
-
-    /* Motion compensation XXX probably unneeded */
-    h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
-                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
-                    mvx, mvy, 16, 16 );
-
-    /* Chroma MC */
-    h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                      h->mb.pic.p_fdec[1],       h->mb.pic.i_stride[1],
-                      mvx, mvy, 8, 8 );
-
-    h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
-                      h->mb.pic.p_fdec[2],       h->mb.pic.i_stride[2],
-                      mvx, mvy, 8, 8 );
-
-    x264_macroblock_encode_skip( h );
-}
-
-/*****************************************************************************
  * x264_macroblock_encode:
  *****************************************************************************/
 void x264_macroblock_encode( x264_t *h )
@@ -545,13 +517,13 @@
 
     if( h->mb.i_type == P_SKIP )
     {
-        /* A bit special */
-        x264_macroblock_encode_pskip( h );
+        /* XXX motion compensation is probably unneeded: it was done during analysis */
+        x264_mb_mc_pskip( h );
+        x264_macroblock_encode_skip( h );
         return;
     }
     if( h->mb.i_type == B_SKIP )
     {
-        /* XXX motion compensation is probably unneeded */
         x264_mb_mc( h );
         x264_macroblock_encode_skip( h );
         return;
@@ -806,6 +778,15 @@
         h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                         h->mb.pic.p_fdec[0],   h->mb.pic.i_stride[0],
                         mvp[0], mvp[1], 16, 16 );
+
+        /* Weighted prediction */
+        if( h->sh.b_wpred[0][0] )
+        {
+            h->pixf.weight[ PIXEL_16x16 ](
+                        h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0],
+                        h->sh.i_wpred_log2_denom[0], h->sh.i_wpred_scale[0][0],
+                        h->sh.i_wpred_offset[0][0] );
+        }
     }
 
     /* get luma diff */
Index: encoder/slicetype_decision.c
===================================================================
--- encoder/slicetype_decision.c	(revision 157)
+++ encoder/slicetype_decision.c	(working copy)
@@ -28,10 +28,15 @@
 #include "common/common.h"
 #include "common/macroblock.h"
 #include "common/cpu.h"
+#include "common/clip1.h"
 #include "macroblock.h"
 #include "me.h"
 
 
+/****************************************************************************
+ * B-frame placement functions
+ ****************************************************************************/
+
 static void x264_mb_analyse_load_costs_lowres( x264_t *h, x264_mb_analysis_t *a )
 {
     static int16_t *p_cost_mv;
@@ -405,3 +410,181 @@
         frm->i_type = X264_TYPE_B;
     }
 }
+
+
+/****************************************************************************
+ * Weighted prediction functions
+ ****************************************************************************/
+
+/* Least-squares fit of pix ~= scale*ref + offset over one plane.  When the fit clearly
+ * beats the unweighted reference, stores quantised explicit weights through
+ * i_log2_denom / wpred / wpred_scale / wpred_offset.  The outputs are left untouched
+ * when the statistics are degenerate (all-zero or flat ref, or pix identical to ref). */
+static void weight_analyse_plane( x264_t *h, uint8_t *pix, int i_pix_stride, uint8_t *ref, int i_ref_stride,
+                                  int i_width, int i_height, int *i_log2_denom,
+                                  int *wpred, int *wpred_scale, int *wpred_offset )
+{
+    int x, y;
+    int64_t S0=0, S1=0, S00=0, S11=0, S01=0;
+    int n = i_width*i_height;
+    float scale, offset, mult;
+    float err_old, err_affine, err_mult, err_quant;
+
+    for( y = 0; y < i_height; y++ )
+    {
+        for( x = 0; x < i_width; x++ )
+        {
+            const uint8_t e0 = ref[y*i_ref_stride+x];
+            const uint8_t e1 = pix[y*i_pix_stride+x];   /* each plane is walked with its own stride */
+            S0 += e0;
+            S1 += e1;
+            S00 += e0*e0;
+            S01 += e0*e1;
+            S11 += e1*e1;
+        }
+    }
+
+    x264_cpu_restore( h->param.cpu );
+
+    if( S00 == 0 || S0*S0 - S00*n == 0 || S11 + S00 - 2*S01 == 0 )
+        return;
+
+    /* calculate scale and offset to minimize SSD */
+    scale = (float)(S0*S1 - S01*n) / (S0*S0 - S00*n);
+    offset  = (float)(S1 - scale*S0) / n;
+    err_old = (float)(S11 + S00 - 2*S01) / n;
+    err_affine = (S11 + scale*scale*S00 + offset*offset*n + 2*scale*offset*S0 - 2*scale*S01 - 2*offset*S1) / n;
+
+    mult = (float)S01/S00;
+    err_mult = (S00*mult*mult - 2*S01*mult + S11) / n;
+
+    /* scale-only model (no offset), currently disabled in favour of the affine model below */
+#if 0
+    if( (mult > 1.03 || mult < 0.98)
+        && mult > 0.5 && mult < 2.0
+        && err_mult/err_old < 0.5 
+        && err_old > 1.0
+        && err_mult < 200.0 )
+    {
+        /* FIXME: chroma planes share a log2_denom */
+        if( mult < 1. )
+            *i_log2_denom = 7;
+        else
+            *i_log2_denom = 6;
+
+        *wpred = 1;
+        *wpred_scale = (int)(mult * (1<<*i_log2_denom) + .5);
+        *wpred_offset = 0;
+
+        /* error actually achieved by the quantised weight (used by the trace only) */
+        float multq = (float)*wpred_scale / (1<<*i_log2_denom);
+        err_quant = (S00*multq*multq - 2*S01*multq + S11) / n;
+
+        fprintf( stderr, "wpred=1 trans = %d/%d %c %d,  Eold=%.4f Emul=%.4f Equant=%.4f \n",
+                 *wpred_scale, 1<<*i_log2_denom, *wpred_offset>=0 ? '+' : '-', abs(*wpred_offset),
+                 err_old, err_mult, err_quant );
+    }
+#else
+    if( (scale > 1.03 || scale < 0.98 || offset > 2 || offset < -2)
+        && scale > 0.5 && scale < 2.0
+        && err_affine/err_old < 0.5 
+        && err_old > 1.0
+        && err_affine < 200.0 )
+    {
+        /* FIXME: chroma planes share a log2_denom */
+        if( scale < 1. )
+            *i_log2_denom = 7;
+        else
+            *i_log2_denom = 6;
+
+        *wpred = 1;
+        *wpred_scale = (int)(scale * (1<<*i_log2_denom) + .5);
+        *wpred_offset = offset < 0 ? -(int)(.5 - offset) : (int)(offset + .5);   /* round to nearest; a bare cast truncates toward zero */
+
+        /* error actually achieved by the quantised weights (used by the trace only) */
+        float multq = (float)*wpred_scale / (1<<*i_log2_denom);
+        float addq = (float)*wpred_offset;
+        err_quant = (S11 + multq*multq*S00 + addq*addq*n + 2*multq*addq*S0 - 2*multq*S01 - 2*addq*S1) / n;
+
+        fprintf( stderr, "wpred=1 trans = %d/%d %c %d,  Eold=%.4f Enew=%.4f Equant=%.4f \n",
+                 *wpred_scale, 1<<*i_log2_denom, *wpred_offset>=0 ? '+' : '-', abs(*wpred_offset),
+                 err_old, err_affine, err_quant );
+    }
+#endif
+    else
+        *i_log2_denom = 0;   /* fit rejected: no weights stored, denominator reset to 0 */
+}
+
+/* Build pix as the explicitly weighted copy of ref.  A 256-entry table is filled once with
+ * clip( ((s*wpred_scale + round) >> i_log2_denom) + wpred_offset ) for every sample value s,
+ * then applied to each of the i_width x i_height samples. */
+static void weight_scale_plane( uint8_t *pix, int i_pix_stride, uint8_t *ref, int i_ref_stride,
+                                int i_width, int i_height, int i_log2_denom,
+                                int wpred_scale, int wpred_offset )
+{
+    /* rounding term is half the denominator; with a zero shift there is nothing to round */
+    const int i_round = i_log2_denom ? (1 << (i_log2_denom-1)) : 0;
+    uint8_t lut[256];
+    int row, col, s;
+
+    for( s = 0; s < 256; s++ )
+        lut[s] = x264_clip_uint8( ((s * wpred_scale + i_round) >> i_log2_denom) + wpred_offset );
+
+    for( row = 0; row < i_height; row++, pix += i_pix_stride, ref += i_ref_stride )
+    {
+        for( col = 0; col < i_width; col++ )
+            pix[col] = lut[ ref[col] ];
+    }
+}
+
+/* Set up explicit weighted prediction for one slice.  Every reference starts unweighted
+ * (weighted[] aliases filtered[], all b_wpred flags cleared); for P slices the luma plane
+ * of fref0[0] is analysed and, if a weight is chosen, weighted copies are built for ME. */
+void x264_weighted_pred_init( x264_t *h, int i_slice_type )
+{
+    int i_ref, i, c;
+
+    /* the rest of the refs have wpred initted to 0 and never changed */
+    for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+        for( i = 0; i < 4; i++ )
+            h->fref0[i_ref]->weighted[i] = h->fref0[i_ref]->filtered[i];
+    for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
+        for( i = 0; i < 4; i++ )
+            h->fref1[i_ref]->weighted[i] = h->fref1[i_ref]->filtered[i];
+    memset( h->sh.b_wpred, 0, sizeof(h->sh.b_wpred) );
+
+    if( i_slice_type != SLICE_TYPE_P )
+        return;
+
+    /* calculate weights for ref0 */
+    for( c = 0; c < 1 /*3*/; c++ )
+    {
+        int *log_denom = &h->sh.i_wpred_log2_denom[c];
+        int *wpred = &h->sh.b_wpred[0][c];
+        int *scale = &h->sh.i_wpred_scale[0][c];
+        int *offset = &h->sh.i_wpred_offset[0][c];
+        *log_denom = 6;
+        *wpred = 0;
+        *scale = 1<<6;
+        *offset = 0;
+        weight_analyse_plane( h, h->fenc->plane[c], h->fenc->i_stride[c],
+                              h->fref0[0]->plane[c], h->fref0[0]->i_stride[c],
+                              h->param.i_width >> !!c, h->param.i_height >> !!c,
+                              log_denom, wpred, scale, offset );
+    }
+
+    /* filter the luma plane for motion estimation (all four planes use the luma stride) */
+    if( h->sh.b_wpred[0][0] )
+    {
+        for( i = 0; i < 4; i++ )
+        {
+            h->fref0[0]->weighted[i] = h->mb.p_weight_buf[i] + h->fref0[0]->i_stride[0] * 32 + 32;
+            weight_scale_plane( h->fref0[0]->weighted[i], h->fref0[0]->i_stride[0],
+                                h->fref0[0]->filtered[i], h->fref0[0]->i_stride[0],
+                                h->param.i_width, h->param.i_height,
+                                h->sh.i_wpred_log2_denom[0],
+                                h->sh.i_wpred_scale[0][0], h->sh.i_wpred_offset[0][0] );
+        }
+    }
+}
+
Index: encoder/encoder.c
===================================================================
--- encoder/encoder.c	(revision 157)
+++ encoder/encoder.c	(working copy)
@@ -277,9 +277,34 @@
         }
     }
 
-    if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) ||
-        ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) )
+    if( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) )
     {
+        bs_write_ue( s, sh->i_wpred_log2_denom[0] );
+        bs_write_ue( s, sh->i_wpred_log2_denom[1] );
+        for( i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
+        {
+            int luma_weight_l0_flag = sh->b_wpred[i][0];
+            int chroma_weight_l0_flag = sh->b_wpred[i][1] || sh->b_wpred[i][2];
+            bs_write1( s, luma_weight_l0_flag );
+            if( luma_weight_l0_flag )
+            {
+                bs_write_se( s, sh->i_wpred_scale[i][0] );
+                bs_write_se( s, sh->i_wpred_offset[i][0] );
+            }
+            bs_write1( s, chroma_weight_l0_flag );
+            if( chroma_weight_l0_flag )
+            {
+                int j;
+                for( j = 1; j < 3; j++ )
+                {
+                    bs_write_se( s, sh->i_wpred_scale[i][j] );
+                    bs_write_se( s, sh->i_wpred_offset[i][j] );
+                }
+            }
+        }
+    }
+    else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B )
+    {
         /* FIXME */
     }
 
@@ -340,6 +365,8 @@
     x264_t *h = x264_malloc( sizeof( x264_t ) );
     int i, i_slice;
 
+    memset( h, 0, sizeof( x264_t ) );
+
     /* Create a copy of param */
     memcpy( &h->param, param, sizeof( x264_param_t ) );
     if( h->param.rc.psz_stat_out )
@@ -1127,6 +1154,8 @@
     /* build ref list 0/1 */
     x264_reference_build_list( h, h->fdec->i_poc, i_slice_type );
 
+    if( h->param.analyse.b_weighted_pred )
+        x264_weighted_pred_init( h, i_slice_type );
     if( i_slice_type == SLICE_TYPE_B )
         x264_macroblock_bipred_init( h );
 
Index: encoder/set.c
===================================================================
--- encoder/set.c	(revision 157)
+++ encoder/set.c	(working copy)
@@ -293,7 +293,7 @@
     pps->i_num_ref_idx_l0_active = 1;
     pps->i_num_ref_idx_l1_active = 1;
 
-    pps->b_weighted_pred = 0;
+    pps->b_weighted_pred = param->analyse.b_weighted_pred ? 1 : 0;
     pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0;
 
     pps->i_pic_init_qp = 26;
Index: encoder/analyse.c
===================================================================
--- encoder/analyse.c	(revision 157)
+++ encoder/analyse.c	(working copy)
@@ -541,7 +541,7 @@
         i_fullpel_thresh -= i_ref_cost;
 
         /* search with ref */
-        LOAD_HPELS( m.p_fref, h->mb.pic.p_fref[0][i_ref], 0 );
+        LOAD_HPELS( m.p_fref, h->mb.pic.p_fref_w[0][i_ref], 0 );
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
@@ -569,7 +569,7 @@
 
 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref];
     uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
     int mvc[5][2], i_mvc;
     int i;
@@ -613,7 +613,7 @@
 
 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref];
     uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
     int mvc[2][2];
     int i;
@@ -648,7 +648,7 @@
 
 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref];
     uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
     int mvc[2][2];
     int i;
@@ -683,7 +683,7 @@
 
 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref];
     uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
 
     int i4x4;
@@ -722,7 +722,7 @@
 
 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref];
     uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
 
     int i8x4;
@@ -758,7 +758,7 @@
 
 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref_w[0][a->l0.i_ref];
     uint8_t  *p_fenc = h->mb.pic.p_fenc[0];
 
     int i4x8;
@@ -1008,20 +1008,23 @@
 }
 #undef CACHE_MV_BI
 
+/* Declares p_fref[list][plane] for the refs chosen in a->l0/a->l1 (needs h and a in scope); caller adds ';'. */
+#define INIT_FREF \
+    uint8_t *p_fref[2][4] = { { \
+            h->mb.pic.p_fref[0][a->l0.i_ref][0], \
+            h->mb.pic.p_fref[0][a->l0.i_ref][1], \
+            h->mb.pic.p_fref[0][a->l0.i_ref][2], \
+            h->mb.pic.p_fref[0][a->l0.i_ref][3]  \
+        }, { \
+            h->mb.pic.p_fref[1][a->l1.i_ref][0], \
+            h->mb.pic.p_fref[1][a->l1.i_ref][1], \
+            h->mb.pic.p_fref[1][a->l1.i_ref][2], \
+            h->mb.pic.p_fref[1][a->l1.i_ref][3]  \
+        } }
+
 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t *p_fref[2][4] =
-        { {
-            h->mb.pic.p_fref[0][a->l0.i_ref][0],
-            h->mb.pic.p_fref[0][a->l0.i_ref][1],
-            h->mb.pic.p_fref[0][a->l0.i_ref][2],
-            h->mb.pic.p_fref[0][a->l0.i_ref][3]
-        }, {
-            h->mb.pic.p_fref[1][a->l1.i_ref][0],
-            h->mb.pic.p_fref[1][a->l1.i_ref][1],
-            h->mb.pic.p_fref[1][a->l1.i_ref][2],
-            h->mb.pic.p_fref[1][a->l1.i_ref][3] 
-        } };
+    INIT_FREF;
     uint8_t *p_fenc = h->mb.pic.p_fenc[0];
     uint8_t pix[2][8*8];
     int i, l;
@@ -1098,18 +1101,7 @@
 
 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t *p_fref[2][4] =
-        { {
-            h->mb.pic.p_fref[0][a->l0.i_ref][0],
-            h->mb.pic.p_fref[0][a->l0.i_ref][1],
-            h->mb.pic.p_fref[0][a->l0.i_ref][2],
-            h->mb.pic.p_fref[0][a->l0.i_ref][3]
-        }, {
-            h->mb.pic.p_fref[1][a->l1.i_ref][0],
-            h->mb.pic.p_fref[1][a->l1.i_ref][1],
-            h->mb.pic.p_fref[1][a->l1.i_ref][2],
-            h->mb.pic.p_fref[1][a->l1.i_ref][3] 
-        } };
+    INIT_FREF;
     uint8_t *p_fenc = h->mb.pic.p_fenc[0];
     uint8_t pix[2][16*8];
     int i_ref_stride = h->mb.pic.i_stride[0];
@@ -1182,18 +1174,7 @@
 }
 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t *p_fref[2][4] =
-        { {
-            h->mb.pic.p_fref[0][a->l0.i_ref][0],
-            h->mb.pic.p_fref[0][a->l0.i_ref][1],
-            h->mb.pic.p_fref[0][a->l0.i_ref][2],
-            h->mb.pic.p_fref[0][a->l0.i_ref][3]
-        }, {
-            h->mb.pic.p_fref[1][a->l1.i_ref][0],
-            h->mb.pic.p_fref[1][a->l1.i_ref][1],
-            h->mb.pic.p_fref[1][a->l1.i_ref][2],
-            h->mb.pic.p_fref[1][a->l1.i_ref][3] 
-        } };
+    INIT_FREF;
     uint8_t *p_fenc = h->mb.pic.p_fenc[0];
     uint8_t pix[2][8*16];
     int i_ref_stride = h->mb.pic.i_stride[0];
Index: x264.c
===================================================================
--- x264.c	(revision 157)
+++ x264.c	(working copy)
@@ -141,7 +141,8 @@
              "                                  - none, all\n"
              "      --direct <string>       Direct MV prediction mode [\"temporal\"]\n"
              "                                  - none, spatial, temporal\n"
-             "  -w, --weightb               Weighted prediction for B-frames\n"
+             "      --weightb               Weighted prediction for B-frames\n"
+             "      --weightp               Weighted prediction for P-frames\n"
              "  -m, --subme <integer>       Subpixel motion estimation quality: 1=fast, 5=best. [%d]\n"
              "\n"
              "      --level <integer>       Specify IDC level\n"
@@ -224,6 +225,8 @@
 #define OPT_NOBADAPT 277
 #define OPT_BBIAS 278
 #define OPT_BPYRAMID 279
+#define OPT_WEIGHTP 280
+#define OPT_WEIGHTB 281
 
         static struct option long_options[] =
         {
@@ -251,7 +254,8 @@
             { "output",  required_argument, NULL, 'o' },
             { "analyse", required_argument, NULL, 'A' },
             { "direct",  required_argument, NULL, OPT_DIRECT },
-            { "weightb", no_argument,       NULL, 'w' },
+            { "weightp", no_argument,       NULL, OPT_WEIGHTP },
+            { "weightb", no_argument,       NULL, OPT_WEIGHTB },
             { "subme",   required_argument, NULL, 'm' },
             { "level",   required_argument, NULL, OPT_LEVEL },
             { "rcsens",  required_argument, NULL, OPT_RCSENS },
@@ -273,7 +277,7 @@
 
         int c;
 
-        c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:m:p:vw",
+        c = getopt_long( argc, argv, "hi:I:b:r:cxB:q:nf:o:s:A:m:p:v",
                          long_options, &long_options_index);
 
         if( c == -1 )
@@ -407,7 +411,10 @@
                 else
                     param->analyse.i_direct_mv_pred = atoi( optarg );
                 break;
-            case 'w':
+            case OPT_WEIGHTP:
+                param->analyse.b_weighted_pred = 1;
+                break;
+            case OPT_WEIGHTB:
                 param->analyse.b_weighted_bipred = 1;
                 break;
             case 'm':
Index: common/frame.h
===================================================================
--- common/frame.h	(revision 157)
+++ common/frame.h	(working copy)
@@ -43,6 +43,7 @@
     int     i_lines_lowres;
     uint8_t *plane[4];
     uint8_t *filtered[4]; /* plane[0], H, V, HV */
+    uint8_t *weighted[4]; /* 0, H, V, HV */
     uint8_t *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
 
     /* for unrestricted mv we allocate more data than needed
Index: common/macroblock.c
===================================================================
--- common/macroblock.c	(revision 157)
+++ common/macroblock.c	(working copy)
@@ -567,9 +567,16 @@
     const int mvy   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
 
     h->mc.mc_luma( h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
-                    &h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x],           h->mb.pic.i_stride[0],
+                    &h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
                     mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
 
+    if( h->sh.b_wpred[i_ref][0] )
+    {
+        h->pixf.weight[ x264_size2pixel[height][width] ](
+                    &h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
+                    h->sh.i_wpred_log2_denom[0], h->sh.i_wpred_scale[i_ref][0], h->sh.i_wpred_offset[i_ref][0] );
+    }
+
     h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
                       &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x],           h->mb.pic.i_stride[1],
                       mvx, mvy, 2*width, 2*height );
@@ -820,6 +827,34 @@
     }
 }
 
+/* Motion-compensate a P_SKIP macroblock into fdec: the 16x16 luma block is predicted from
+ * list-0 reference 0 and weighted in place when luma weighting is enabled for that
+ * reference, then both 8x8 chroma blocks are predicted (chroma is not weighted here). */
+void x264_mb_mc_pskip( x264_t *h )
+{
+    /* cached list-0 vector of the first block, passed through x264_clip3 with mb.mv_min/mv_max */
+    const int i_mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0], h->mb.mv_min[0], h->mb.mv_max[0] );
+    const int i_mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    int i_plane;
+
+    /* luma */
+    h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
+                   h->mb.pic.p_fdec[0],    h->mb.pic.i_stride[0],
+                   i_mvx, i_mvy, 16, 16 );
+    if( h->sh.b_wpred[0][0] )
+        h->pixf.weight[ PIXEL_16x16 ]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0],
+                                       h->sh.i_wpred_log2_denom[0],
+                                       h->sh.i_wpred_scale[0][0], h->sh.i_wpred_offset[0][0] );
+
+    /* chroma: planes 1 and 2 read from p_fref slots 4 and 5 */
+    for( i_plane = 1; i_plane < 3; i_plane++ )
+    {
+        h->mc.mc_chroma( h->mb.pic.p_fref[0][0][i_plane+3], h->mb.pic.i_stride[i_plane],
+                         h->mb.pic.p_fdec[i_plane],         h->mb.pic.i_stride[i_plane],
+                         i_mvx, i_mvy, 8, 8 );
+    }
+}
+
 void x264_macroblock_cache_init( x264_t *h )
 {
     int i, j;
@@ -937,25 +972,33 @@
     /* load picture pointers */
     for( i = 0; i < 3; i++ )
     {
-        const int w = (i == 0 ? 16 : 8);
         const int i_stride = h->fdec->i_stride[i];
+        const int i_xy = (i == 0 ? 16 : 8) * ( i_mb_x + i_mb_y * i_stride );
         int   j;
 
         h->mb.pic.i_stride[i] = i_stride;
 
-        h->mb.pic.p_fenc[i] = &h->fenc->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
+        h->mb.pic.p_fenc[i] = &h->fenc->plane[i][i_xy];
+        h->mb.pic.p_fdec[i] = &h->fdec->plane[i][i_xy];
 
-        h->mb.pic.p_fdec[i] = &h->fdec->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
-
         for( j = 0; j < h->i_ref0; j++ )
+            h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &h->fref0[j]->plane[i][i_xy];
+        for( j = 0; j < h->i_ref1; j++ )
+            h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &h->fref1[j]->plane[i][i_xy];
+    }
+    for( i = 0; i < 4; i++ )
+    {
+        const int i_xy = 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] );
+        int j;
+        for( j = 0; j < h->i_ref0; j++ )
         {
-            h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &h->fref0[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
-            h->mb.pic.p_fref[0][j][i+1] = &h->fref0[j]->filtered[i+1][ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
+            h->mb.pic.p_fref[0][j][i]   = &h->fref0[j]->filtered[i][i_xy];
+            h->mb.pic.p_fref_w[0][j][i] = &h->fref0[j]->weighted[i][i_xy];
         }
         for( j = 0; j < h->i_ref1; j++ )
         {
-            h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &h->fref1[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
-            h->mb.pic.p_fref[1][j][i+1] = &h->fref1[j]->filtered[i+1][ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
+            h->mb.pic.p_fref[1][j][i]   = &h->fref1[j]->filtered[i][i_xy];
+            h->mb.pic.p_fref_w[1][j][i] = &h->fref1[j]->weighted[i][i_xy];
         }
     }
 
Index: common/macroblock.h
===================================================================
--- common/macroblock.h	(revision 157)
+++ common/macroblock.h	(working copy)
@@ -195,6 +195,7 @@
 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
 
 void x264_mb_mc( x264_t *h );
+void x264_mb_mc_pskip( x264_t *h );
 
 
 static inline void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int ref )
Index: common/pixel.c
===================================================================
--- common/pixel.c	(revision 157)
+++ common/pixel.c	(working copy)
@@ -180,12 +180,42 @@
 PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
 
 
-/* Implicit weighted bipred only:
+/* Explicit weighted pred (unidirectional only): weight a width x height block in place.
+ * Every pixel p becomes clip( (p*weight*2^(7-log2_denom) + offset*128 + 64) >> 7 ): the
+ * weight is rescaled to a fixed denominator of 128 so one shift serves any log2_denom.
+ * NOTE(review): that rescale needs i_log2_denom <= 7 -- confirm no caller passes more.
+ * FIXME: op_scale1 could be memoized */
+#define op_scale1(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight2 + i_offset2) >> 7 )
+static inline void pixel_weight_wxh( uint8_t *dst, int i_dst, int width, int height, int i_log2_denom, int i_weight, int i_offset )
+{
+    const int i_weight2 = i_weight << (7-i_log2_denom);
+    const int i_offset2 = (i_offset << 7) + (1 << 6);
+    int i_cols, x, y;
+
+    /* widths other than 2, 4 and 8 process a full 16-pixel row */
+    switch( width )
+    {
+        case 2:
+        case 4:
+        case 8:
+            i_cols = width;
+            break;
+        default:
+            i_cols = 16;
+            break;
+    }
+
+    for( y = 0; y < height; y++, dst += i_dst )
+        for( x = 0; x < i_cols; x++ )
+            op_scale1(x);
+}
+
+/* Implicit weighted bipred
  * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
 #define op_scale2(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight1 + src[x]*i_weight2 + (1<<5)) >> 6 )
 static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){
+    const int i_weight2 = 64 - i_weight1;
     int y;
-    const int i_weight2 = 64 - i_weight1;
     for(y=0; y<height; y++, dst += i_dst, src += i_src){
         op_scale2(0);
         op_scale2(1);
@@ -209,26 +239,32 @@
     }
 }
 
-#define PIXEL_AVG_WEIGHT_C( width, height ) \
+#define PIXEL_WEIGHT_C( width, height ) \
 static void pixel_avg_weight_##width##x##height( \
                 uint8_t *pix1, int i_stride_pix1, \
                 uint8_t *pix2, int i_stride_pix2, int i_weight1 ) \
 { \
     pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height, i_weight1 ); \
+} \
+static void pixel_weight_##width##x##height( \
+                uint8_t *pix, int i_stride_pix, int i_log2_denom, int i_weight, int i_offset ) \
+{ \
+    pixel_weight_wxh( pix, i_stride_pix, width, height, i_log2_denom, i_weight, i_offset ); \
 }
 
-PIXEL_AVG_WEIGHT_C(16,16)
-PIXEL_AVG_WEIGHT_C(16,8)
-PIXEL_AVG_WEIGHT_C(8,16)
-PIXEL_AVG_WEIGHT_C(8,8)
-PIXEL_AVG_WEIGHT_C(8,4)
-PIXEL_AVG_WEIGHT_C(4,8)
-PIXEL_AVG_WEIGHT_C(4,4)
-PIXEL_AVG_WEIGHT_C(4,2)
-PIXEL_AVG_WEIGHT_C(2,4)
-PIXEL_AVG_WEIGHT_C(2,2)
+PIXEL_WEIGHT_C(16,16)
+PIXEL_WEIGHT_C(16,8)
+PIXEL_WEIGHT_C(8,16)
+PIXEL_WEIGHT_C(8,8)
+PIXEL_WEIGHT_C(8,4)
+PIXEL_WEIGHT_C(4,8)
+PIXEL_WEIGHT_C(4,4)
+PIXEL_WEIGHT_C(4,2)
+PIXEL_WEIGHT_C(2,4)
+PIXEL_WEIGHT_C(2,2)
+#undef op_scale1
 #undef op_scale2
-#undef PIXEL_AVG_WEIGHT_C
+#undef PIXEL_WEIGHT_C
 
 /****************************************************************************
  * x264_pixel_init:
@@ -272,6 +308,17 @@
     pixf->avg_weight[PIXEL_4x2]  = pixel_avg_weight_4x2;
     pixf->avg_weight[PIXEL_2x4]  = pixel_avg_weight_2x4;
     pixf->avg_weight[PIXEL_2x2]  = pixel_avg_weight_2x2;
+    
+    pixf->weight[PIXEL_16x16]= pixel_weight_16x16;
+    pixf->weight[PIXEL_16x8] = pixel_weight_16x8;
+    pixf->weight[PIXEL_8x16] = pixel_weight_8x16;
+    pixf->weight[PIXEL_8x8]  = pixel_weight_8x8;
+    pixf->weight[PIXEL_8x4]  = pixel_weight_8x4;
+    pixf->weight[PIXEL_4x8]  = pixel_weight_4x8;
+    pixf->weight[PIXEL_4x4]  = pixel_weight_4x4;
+    pixf->weight[PIXEL_4x2]  = pixel_weight_4x2;
+    pixf->weight[PIXEL_2x4]  = pixel_weight_2x4;
+    pixf->weight[PIXEL_2x2]  = pixel_weight_2x2;
 
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMXEXT )
Index: common/pixel.h
===================================================================
--- common/pixel.h	(revision 157)
+++ common/pixel.h	(working copy)
@@ -28,6 +28,7 @@
 typedef int  (*x264_pixel_satd_t)( uint8_t *, int, uint8_t *, int );
 typedef void (*x264_pixel_avg_t) ( uint8_t *, int, uint8_t *, int );
 typedef void (*x264_pixel_avg_weight_t) ( uint8_t *, int, uint8_t *, int, int );
+typedef void (*x264_pixel_weight_t) ( uint8_t *, int, int, int, int );
 
 enum
 {
@@ -68,6 +69,7 @@
     x264_pixel_satd_t satd[7];
     x264_pixel_avg_t  avg[10];
     x264_pixel_avg_weight_t avg_weight[10];
+    x264_pixel_weight_t weight[10];
 } x264_pixel_function_t;
 
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
Index: common/frame.c
===================================================================
--- common/frame.c	(revision 157)
+++ common/frame.c	(working copy)
@@ -76,6 +76,17 @@
                                 frame->i_stride[0] * 32 + 32;
     }
 
+    for( i = 0; i < 4; i++ )
+        frame->weighted[i] = frame->filtered[i];
+    
+    /* FIXME move somewhere */
+    if( !h->mb.p_weight_buf[0] )
+    {
+        for( i = 0; i < 4; i++ )
+            h->mb.p_weight_buf[i] = x264_malloc( frame->i_stride[0] *
+                                               ( frame->i_lines[0] + 64 ) );
+    }
+
     frame->i_stride_lowres = frame->i_stride[0]/2 + 32;
     frame->i_lines_lowres = frame->i_lines[0]/2;
     for( i = 0; i < 4; i++ )
Index: common/common.h
===================================================================
--- common/common.h	(revision 157)
+++ common/common.h	(working copy)
@@ -147,6 +147,11 @@
         int arg;
     } ref_pic_list_order[2][16];
 
+    int b_wpred[16][3];
+    int i_wpred_log2_denom[3];
+    int i_wpred_scale[16][3];
+    int i_wpred_offset[16][3];
+
     int i_cabac_init_idc;
 
     int i_qp_delta;
@@ -345,7 +350,8 @@
             uint8_t *p_fdec[3];
 
             /* pointer over mb of the references */
-            uint8_t *p_fref[2][16][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+            uint8_t *p_fref[2][16][4+2]; /* last: l0, lH, lV, lHV, cU, cV */
+            uint8_t *p_fref_w[2][16][4]; /* l0, lH, lV, lHV */
 
             /* common stride */
             int     i_stride[3];
@@ -379,9 +385,12 @@
         int     i_last_dqp; /* last delta qp */
         int     b_variable_qp; /* whether qp is allowed to vary per macroblock */
 
-        /* B_direct and weighted prediction */
+        /* weighted prediction */
+        int     bipred_weight[16][16];
+        uint8_t *p_weight_buf[4];
+
+        /* B_direct */
         int     dist_scale_factor[16][16];
-        int     bipred_weight[16][16];
         /* maps fref1[0]'s ref indices into the current list0 */
         int     map_col_to_list0_buf[2]; // for negative indices
         int     map_col_to_list0[16];
Index: x264.h
===================================================================
--- x264.h	(revision 157)
+++ x264.h	(working copy)
@@ -143,7 +143,8 @@
         int          i_subpel_refine; /* subpixel motion estimation quality */
         int          i_mv_range; /* maximum length of a mv (in pixels) */
 
-        int          b_weighted_bipred; /* implicit weighting for B-frames */
+        int          b_weighted_pred;   /* weighted prediction for P-frames */
+        int          b_weighted_bipred; /* implicit weighted prediction for B-frames */
 
         int          b_psnr;    /* Do we compute PSNR stats (save a few % of cpu) */
     } analyse;