Index: encoder/me.c =================================================================== --- encoder/me.c (revision 27) +++ encoder/me.c (working copy) @@ -25,10 +25,24 @@ #include #include #include +#include #include "common/common.h" #include "me.h" +#define BENCH 0 + +#if BENCH +static uint64_t sad16_calls=0, sad16_new=0; +static uint64_t sad8_calls=0, sad8_new=0; +static uint64_t sad_print=1; +#define SAD_CALL(s) sad##s##_calls++; +#define SAD_NEW(s) sad##s##_new++; +#else +#define SAD_CALL(s) +#define SAD_NEW(s) +#endif + /* presets selected from good points on the speed-vs-quality curve of several test videos * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel } * where me_* are the number of EPZS iterations run on all candidate block types, @@ -43,19 +57,38 @@ {0,0,2,3}}; static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters ); +static void refine_subpel_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int hpel_iters, int qpel_iters ); +//FIXME +#if 0 +#define COPY3_IF_LT(x,y,a,b,c,d)\ +asm volatile (\ + "cmpl %0, %3 \n\t"\ + "cmovl %3, %0 \n\t"\ + "cmovl %4, %1 \n\t"\ + "cmovl %5, %2 \n\t"\ + : "+r" (x), "+r" (a), "+r" (c)\ + : "r" (y), "r" (b), "r" (d)\ +); +#else +#define COPY3_IF_LT(x,y,a,b,c,d) \ +{ \ + if((y)<(x)) \ + { \ + (x)=(y);\ + (a)=(b);\ + (c)=(d);\ + } \ +} +#endif + #define COST_MV( mx, my ) \ { \ int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], \ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \ + p_cost_mvx[ (mx)<<2 ] \ + p_cost_mvy[ (my)<<2 ]; \ - if( cost < bcost ) \ - { \ - bcost = cost; \ - bmx = mx; \ - bmy = my; \ - } \ + COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ } void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh ) @@ -86,7 +119,7 @@ bmx = pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max ); bmy = pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max ); bcost = COST_MAX; - 
COST_MV( bmx, bmy ); + COST_MV( pmx, pmy ); /* I don't know why this helps */ bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ]; @@ -283,6 +316,7 @@ } #undef COST_MV + void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) { int hpel = subpel_iterations[h->mb.i_subpel_refine][0]; @@ -354,4 +388,507 @@ m->mv[1] = bmy; m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ]; } +#undef COST_MV + + + +#define MVD( mvp, mx, my ) \ + p_cost_mv[ ((mx)<<2) - (mvp)[0] ] + p_cost_mv[ ((my)<<2) - (mvp)[1] ] + +static int cost_mv_multipart_16x16( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my ) +{ + const int i_stride = m->i_stride[0]; + const int16_t *p_cost_mv = m->p_cost_mv; + int sad[4]; + int i8; + int mx2 = mx<<2, my2 = my<<2; + uint8_t *cache_entry; + + SAD_CALL(16); + if( CACHE_FPEL_IN_RANGE( mx, my ) ) + { + cache_entry = &l->p_cache_fpel[ CACHE_FPEL_OFFS( mx, my ) ]; + if( *cache_entry ) + return 0xffff; + *cache_entry = 1; + } + SAD_NEW(16); + + h->pixf.sad_split16x16( m->p_fenc[0], i_stride, &m->p_fref[0][mx+my*i_stride], i_stride, sad ); + + for( i8 = 0; i8 < 4; i8++ ) + COPY3_IF_LT( l->me8x8[i8].cost, sad[i8] + MVD(l->me8x8[i8].mvp, mx, my), + l->me8x8[i8].mv[0], mx2, l->me8x8[i8].mv[1], my2 ); + COPY3_IF_LT( l->me16x8[0].cost, sad[0]+sad[1] + MVD(l->me16x8[0].mvp, mx, my), + l->me16x8[0].mv[0], mx2, l->me16x8[0].mv[1], my2 ); + COPY3_IF_LT( l->me16x8[1].cost, sad[2]+sad[3] + MVD(l->me16x8[1].mvp, mx, my), + l->me16x8[1].mv[0], mx2, l->me16x8[1].mv[1], my2 ); + COPY3_IF_LT( l->me8x16[0].cost, sad[0]+sad[2] + MVD(l->me8x16[0].mvp, mx, my), + l->me8x16[0].mv[0], mx2, l->me8x16[0].mv[1], my2 ); + COPY3_IF_LT( l->me8x16[1].cost, sad[1]+sad[3] + MVD(l->me8x16[1].mvp, mx, my), + l->me8x16[1].mv[0], mx2, l->me8x16[1].mv[1], my2 ); + return sad[0]+sad[1]+sad[2]+sad[3] + MVD(m->mvp, mx, my); +} +static int cost_mv_multipart_8x8( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my ) +{ + const int i_stride = m->i_stride[0]; + const int16_t 
*p_cost_mv = m->p_cost_mv; + + SAD_CALL(8); + if( CACHE_FPEL_IN_RANGE( mx, my ) && l->p_cache_fpel[ CACHE_FPEL_OFFS( mx, my ) ] ) + return 0xffff; + SAD_NEW(8); + + return h->pixf.sad[m->i_pixel]( m->p_fenc[0], i_stride, &m->p_fref[0][my*i_stride+mx], i_stride ) + + MVD(m->mvp, mx, my); +} + +static int (*const cost_mv_multipart[4])( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my ) = +{ + cost_mv_multipart_16x16, cost_mv_multipart_8x8, cost_mv_multipart_8x8, cost_mv_multipart_8x8 +}; + +#define COST_MV( mx, my ) \ +{ \ + int cost = cost_mv_multipart[ m->i_pixel ]( h, l, m, mx, my ); \ + COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ +} + +void x264_me_search_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh ) +{ + const int i_me_range = h->param.analyse.i_me_range; + const int16_t *p_cost_mv = m->p_cost_mv; + const int *mvp = m->mvp; + uint8_t *p_fref = m->p_fref[0]; + int bmx, bmy, bcost; + int omx, omy, pmx, pmy; + int i, j; + + int mv_x_min = h->mb.mv_min_fpel[0]; + int mv_y_min = h->mb.mv_min_fpel[1]; + int mv_x_max = h->mb.mv_max_fpel[0]; + int mv_y_max = h->mb.mv_max_fpel[1]; + + /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */ +// const uint64_t mvp_bak = *(uint64_t*)m->mvp; + if( h->mb.i_me_method == X264_ME_UMH ) + { + m->mvp[0] = x264_clip3( m->mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] ); + m->mvp[1] = x264_clip3( m->mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] ); + } + + if( m->i_pixel == PIXEL_16x16 ) + { + l->me16x8[0].cost = l->me16x8[1].cost = + l->me8x16[0].cost = l->me8x16[1].cost = + l->me8x8[0].cost = l->me8x8[1].cost = + l->me8x8[2].cost = l->me8x8[3].cost = COST_MAX; + + bcost = COST_MAX; + } + else + { + bcost = m->cost; + bmx = ( m->mv[0] + 2 ) >> 2; + bmy = ( m->mv[1] + 2 ) >> 2; + } + + pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max ); + pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max ); + + if( 
m->i_pixel == PIXEL_16x16 || *(uint64_t*)m->mvp != *(uint64_t*)l->me16x16.mvp ) + { + COST_MV( pmx, pmy ); + /* I don't know why this helps */ + bcost -= MVD( m->mvp, bmx, bmy ); + } + + /* try extra predictors if provided */ + for( i = 0; i < i_mvc; i++ ) + { + const int mx = x264_clip3( ( mvc[i][0] + 2 ) >> 2, mv_x_min, mv_x_max ); + const int my = x264_clip3( ( mvc[i][1] + 2 ) >> 2, mv_y_min, mv_y_max ); + if( mx != bmx || my != bmy ) + COST_MV( mx, my ); + } + + COST_MV( 0, 0 ); + + mv_x_max += 8; + mv_y_max += 8; + mv_x_min -= 8; + mv_y_min -= 8; + + switch( h->mb.i_me_method ) + { + case X264_ME_DIA: + /* diamond search, radius 1 */ +#define DIA1_ITER(mx, my)\ + {\ + omx = mx;\ + omy = my;\ + COST_MV( omx , omy-1 );\ + COST_MV( omx , omy+1 );\ + COST_MV( omx-1, omy );\ + COST_MV( omx+1, omy );\ + } + + for( i = 0; i < i_me_range; i++ ) + { + DIA1_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + break; + + case X264_ME_HEX: + /* hexagon search, radius 2 */ +#define HEX2_ITER(mx, my)\ + {\ + omx = mx;\ + omy = my;\ + COST_MV( omx-2, omy );\ + COST_MV( omx-1, omy+2 );\ + COST_MV( omx+1, omy+2 );\ + COST_MV( omx+2, omy );\ + COST_MV( omx+1, omy-2 );\ + COST_MV( omx-1, omy-2 );\ + } + + for( i = 0; i < i_me_range/2; i++ ) + { + HEX2_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + /* square refine */ + DIA1_ITER( bmx, bmy ); + COST_MV( omx-1, omy-1 ); + COST_MV( omx-1, omy+1 ); + COST_MV( omx+1, omy-1 ); + COST_MV( omx+1, omy+1 ); + break; + + case X264_ME_UMH: + /* Uneven-cross Multi-Hexagon-grid Search + * as in JM, except without early termination */ + + DIA1_ITER( pmx, pmy ); + if( pmx || pmy ) + DIA1_ITER( 0, 0 ); + DIA1_ITER( bmx, bmy ); + + /* cross */ + omx = bmx; omy = bmy; + for( i = 1; i < i_me_range; i+=2 ) + { + if( omx + i <= mv_x_max ) + COST_MV( omx + i, omy ); + if( omx - i >= mv_x_min ) + COST_MV( omx - i, omy ); + } + for( i = 1; i < i_me_range/2; i+=2 ) + { + if( omy + i <= mv_y_max ) + COST_MV( omx, omy + i ); 
+ if( omy - i >= mv_y_min ) + COST_MV( omx, omy - i ); + } + + /* 5x5 ESA */ + omx = bmx; omy = bmy; + for( i = 0; i < 24; i++ ) + { + static const int square2_x[24] = {1,1,0,-1,-1,-1, 0, 1, 2,2,2,2,1,0,-1,-2,-2,-2,-2,-2,-1, 0, 1, 2}; + static const int square2_y[24] = {0,1,1, 1, 0,-1,-1,-1,-1,0,1,2,2,2, 2, 2, 1, 0,-1,-2,-2,-2,-2,-2}; + COST_MV( omx + square2_x[i], omy + square2_y[i] ); + } + /* hexagon grid */ + omx = bmx; omy = bmy; + for( i = 1; i <= i_me_range/4; i++ ) + { + int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min ); + for( j = 0; j < 16; j++ ) + { + static const int hex4_x[16] = {0,-2,-4,-4,-4,-4,-4,-2, 0, 2, 4, 4,4,4,4,2}; + static const int hex4_y[16] = {4, 3, 2, 1, 0,-1,-2,-3,-4,-3,-2,-1,0,1,2,3}; + int mx = omx + hex4_x[j]*i; + int my = omy + hex4_y[j]*i; + if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max + && my >= mv_y_min && my <= mv_y_max ) ) + COST_MV( mx, my ); + } + } + + /* iterative search */ + for( i = 0; i < i_me_range; i++ ) + { + HEX2_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + for( i = 0; i < i_me_range; i++ ) + { + DIA1_ITER( bmx, bmy ); + if( bmx == omx && bmy == omy ) + break; + } + break; + + case X264_ME_ESA: + { + const int min_x = X264_MAX( bmx - i_me_range, mv_x_min); + const int min_y = X264_MAX( bmy - i_me_range, mv_y_min); + const int max_x = X264_MIN( bmx + i_me_range, mv_x_max); + const int max_y = X264_MIN( bmy + i_me_range, mv_y_max); + for( omy = min_y; omy <= max_y; omy++ ) + for( omx = min_x; omx <= max_x; omx++ ) + { + COST_MV( omx, omy ); + } + } + break; + } + + /* -> qpel mv */ + m->mv[0] = bmx << 2; + m->mv[1] = bmy << 2; + + /* +#define SHIFT_QMV( m ) \ + l->m.mv[0] <<= 2; \ + l->m.mv[1] <<= 2; + + SHIFT_QMV( me16x8[0] ); SHIFT_QMV( me16x8[1] ); + SHIFT_QMV( me8x16[0] ); SHIFT_QMV( me8x16[1] ); + SHIFT_QMV( me8x8[0] ); SHIFT_QMV( me8x8[1] ); + SHIFT_QMV( me8x8[2] ); SHIFT_QMV( me8x8[3] ); +#undef SHIFT_QMV + */ + +// *(uint64_t*)m->mvp = 
mvp_bak; + + /* subpel refine */ + if( h->mb.i_subpel_refine >= 3 ) + { + int hpel, qpel; + + /* early termination (when examining multiple reference frames) + * FIXME: this can update fullpel_thresh even if the match + * ref is rejected after subpel refinement */ + if( p_fullpel_thresh ) + { + if( (m->cost*7)>>3 > *p_fullpel_thresh ) + return; + else if( m->cost < *p_fullpel_thresh ) + *p_fullpel_thresh = m->cost; + } + + hpel = subpel_iterations[h->mb.i_subpel_refine][2]; + qpel = subpel_iterations[h->mb.i_subpel_refine][3]; +#if 0 + refine_subpel_multipart( h, l, m, hpel, qpel ); +#else + m->cost_mv = MVD( mvp, bmx, bmy ); + m->cost = h->pixf.mbcmp[m->i_pixel]( m->p_fenc[0], m->i_stride[0], + &p_fref[bmy * m->i_stride[0] + bmx], m->i_stride[0] ) + + m->cost_mv; + if( h->mb.b_chroma_me ) + { + const int bw = x264_pixel_size[m->i_pixel].w; + const int bh = x264_pixel_size[m->i_pixel].h; + DECLARE_ALIGNED( uint8_t, pix[8*8*2], 16 ); + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + h->mc.mc_chroma( m->p_fref[5], m->i_stride[2], pix+8*8, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + m->cost += h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ) + + h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix+8*8, 8 ); + } + refine_subpel( h, m, hpel, qpel ); +#endif + } + else + { + /* compute the real cost */ + m->cost_mv = MVD( mvp, bmx, bmy ); + m->cost = h->pixf.mbcmp[m->i_pixel]( m->p_fenc[0], m->i_stride[0], + &p_fref[bmy * m->i_stride[0] + bmx], m->i_stride[0] ) + + m->cost_mv; + if( h->mb.b_chroma_me ) + { + const int bw = x264_pixel_size[m->i_pixel].w; + const int bh = x264_pixel_size[m->i_pixel].h; + DECLARE_ALIGNED( uint8_t, pix[8*8*2], 16 ); + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + h->mc.mc_chroma( m->p_fref[5], m->i_stride[2], pix+8*8, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); + m->cost += h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 
) + + h->pixf.mbcmp[m->i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix+8*8, 8 ); + } + if( m->i_pixel == PIXEL_16x16 ) + { + l->me16x16.scost = + l->me16x8[0].scost = l->me16x8[1].scost = + l->me8x16[0].scost = l->me8x16[1].scost = + l->me8x8[0].scost = l->me8x8[1].scost = + l->me8x8[2].scost = l->me8x8[3].scost = COST_MAX; + //debug + l->me16x16.smv[0] = l->me16x16.smv[1] = + l->me16x8[0].smv[0] = l->me16x8[1].smv[0] = + l->me8x16[0].smv[0] = l->me8x16[1].smv[0] = + l->me8x8[0].smv[0] = l->me8x8[1].smv[0] = + l->me8x8[2].smv[0] = l->me8x8[3].smv[0] = + l->me16x8[0].smv[1] = l->me16x8[1].smv[1] = + l->me8x16[0].smv[1] = l->me8x16[1].smv[1] = + l->me8x8[0].smv[1] = l->me8x8[1].smv[1] = + l->me8x8[2].smv[1] = l->me8x8[3].smv[1] = 0; + } + } + +#if BENCH + if(sad16_calls > sad_print && sad8_calls > 0) + { + while(sad16_calls > sad_print) sad_print = sad_print * 3/2 + 1; + fprintf(stderr, "cached 16: 0.%03ld =%9ld /%9ld \n", + (sad16_calls-sad16_new)*1000/sad16_calls, sad16_calls-sad16_new, sad16_calls); + fprintf(stderr, "cached 8: 0.%03ld =%9ld /%9ld \n", + (sad8_calls-sad8_new)*1000/sad8_calls, sad8_calls-sad8_new, sad8_calls); + } +#endif +} +#undef COST_MV +#undef MVD + + +#define MVD( mvp, mx, my ) \ + p_cost_mv[ (mx) - (mvp)[0] ] + p_cost_mv[ (my) - (mvp)[1] ] + +/* optimization: no chroma_me for smaller partitions? 
*/ +static int cost_smv_multipart_16x16( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my, int bcost ) +{ + DECLARE_ALIGNED( uint8_t, pix[16*16], 16 ); + const int16_t *p_cost_mv = m->p_cost_mv; + int sad[4]; + int i8; + int stride = 16; + uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, 16, 16 ); + +#define x (i8&1) +#define y (i8>>1) + for( i8 = 0; i8 < 4; i8++ ) + sad[i8] = h->pixf.mbcmp[PIXEL_8x8]( &m->p_fenc[0][8*(x+y*m->i_stride[0])], m->i_stride[0], src+8*(x+y*stride), stride ); + if( h->mb.b_chroma_me ) + { + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, mx, my, 8, 8 ); + h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix+64, 8, mx, my, 8, 8 ); + for( i8 = 0; i8 < 4; i8++ ) + sad[i8] += h->pixf.mbcmp[PIXEL_4x4]( &m->p_fenc[1][4*(x+y*m->i_stride[1])], m->i_stride[1], pix +4*(x+y*8), 8 ) + + h->pixf.mbcmp[PIXEL_4x4]( &m->p_fenc[2][4*(x+y*m->i_stride[1])], m->i_stride[1], pix+64+4*(x+y*8), 8 ); + } +#undef x +#undef y + + for( i8 = 0; i8 < 4; i8++ ) + COPY3_IF_LT( l->me8x8[i8].scost, sad[i8] + MVD(l->me8x8[i8].mvp, mx, my), + l->me8x8[i8].smv[0], mx, l->me8x8[i8].smv[1], my ); + + COPY3_IF_LT( l->me16x8[0].scost, sad[0]+sad[1] + MVD(l->me16x8[0].mvp, mx, my), + l->me16x8[0].smv[0], mx, l->me16x8[0].smv[1], my ); + COPY3_IF_LT( l->me16x8[1].scost, sad[2]+sad[3] + MVD(l->me16x8[1].mvp, mx, my), + l->me16x8[1].smv[0], mx, l->me16x8[1].smv[1], my ); + COPY3_IF_LT( l->me8x16[0].scost, sad[0]+sad[2] + MVD(l->me8x16[0].mvp, mx, my), + l->me8x16[0].smv[0], mx, l->me8x16[0].smv[1], my ); + COPY3_IF_LT( l->me8x16[1].scost, sad[1]+sad[3] + MVD(l->me8x16[1].mvp, mx, my), + l->me8x16[1].smv[0], mx, l->me8x16[1].smv[1], my ); + + return sad[0]+sad[1]+sad[2]+sad[3] + MVD(m->mvp, mx, my); +} + +static int cost_smv_multipart_8x8( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my, int bcost ) +{ + DECLARE_ALIGNED( uint8_t, pix[16*16], 16 ); + const int16_t *p_cost_mv = m->p_cost_mv; + int sad; + int 
stride = 8; + const int i_pixel = m->i_pixel; + const int bw = x264_pixel_size[i_pixel].w; + const int bh = x264_pixel_size[i_pixel].h; + uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); + + sad = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) + + MVD(m->mvp, mx, my); + + if( h->mb.b_chroma_me && sad < bcost ) + { + h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); + sad += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ); + if( sad < bcost ) + { + h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); + sad += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix, 8 ); + } + } + + return sad; +} + +static int (*const cost_smv_multipart[4])( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int mx, int my, int bcost ) = +{ + cost_smv_multipart_16x16, cost_smv_multipart_8x8, cost_smv_multipart_8x8, cost_smv_multipart_8x8 +}; + +#define COST_MV( mx, my ) \ +{ \ + int cost = cost_smv_multipart[ m->i_pixel ]( h, l, m, mx, my, bcost ); \ + COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ +} + +static void refine_subpel_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int hpel_iters, int qpel_iters ) +{ + const int16_t *p_cost_mv = m->p_cost_mv; + int step, i; + int bmx, bmy, bcost; // FIXME gcc warning + + if( m->i_pixel == PIXEL_16x16 ) + { + l->me16x8[0].scost = l->me16x8[1].scost = + l->me8x16[0].scost = l->me8x16[1].scost = + l->me8x8[0].scost = l->me8x8[1].scost = + l->me8x8[2].scost = l->me8x8[3].scost = + bcost = COST_MAX; + } + else + { + bcost = COST_MAX; +// bcost = m->scost; +// bmx = m->smv[0]; +// bmy = m->smv[1]; + } + + COST_MV( m->mv[0], m->mv[1] ); + + if( hpel_iters ) + COST_MV( m->mvp[0], m->mvp[1] ); + + for( step = 2; step >= 1; step-- ) + { + for( i = step>1 ? 
hpel_iters : qpel_iters; i > 0; i-- )
+        {
+            int omx = bmx;
+            int omy = bmy;
+            COST_MV( omx, omy - step );
+            COST_MV( omx, omy + step );
+            COST_MV( omx - step, omy );
+            COST_MV( omx + step, omy );
+            if( bmx == omx && bmy == omy )
+                break;
+        }
+    }
+
+    m->cost = bcost;
+    m->mv[0] = bmx;
+    m->mv[1] = bmy;
+    m->cost_mv = MVD( m->mvp, bmx, bmy );
+}
+#undef COST_MV
+#undef MVD
+
Index: encoder/me.h
===================================================================
--- encoder/me.h	(revision 27)
+++ encoder/me.h	(working copy)
@@ -26,6 +26,22 @@
 
 #define COST_MAX (1<<28)
 
+#define CACHE_FPEL_MAX 0xffff
+#define CACHE_FPEL_RADIUS 32
+#define CACHE_FPEL_STRIDE (2*CACHE_FPEL_RADIUS)
+#define CACHE_FPEL_SIZE (4*CACHE_FPEL_RADIUS*CACHE_FPEL_RADIUS)
+#define CACHE_FPEL_OFFS(x,y) ((x)+CACHE_FPEL_RADIUS+((y)+CACHE_FPEL_RADIUS)*CACHE_FPEL_STRIDE)
+#if 0
+#define CACHE_FPEL_IN_RANGE(x,y) ((x) >= -CACHE_FPEL_RADIUS && \
+                                  (x) <  CACHE_FPEL_RADIUS && \
+                                  (y) >= -CACHE_FPEL_RADIUS && \
+                                  (y) <  CACHE_FPEL_RADIUS)
+#else
+// assumes radius is a power of 2; mask is -STRIDE (== ~(2*RADIUS-1)) because left-shifting a negative value is UB
+#define CACHE_FPEL_IN_RANGE(x,y) (!( ((x)+CACHE_FPEL_RADIUS) & (-CACHE_FPEL_STRIDE) \
+                                  || ((y)+CACHE_FPEL_RADIUS) & (-CACHE_FPEL_STRIDE) ))
+#endif
+
 typedef struct
 {
     /* input */
@@ -42,12 +58,17 @@
     int cost_mv; /* lambda * nbits for the chosen mv */
     int cost;    /* satd + lambda * nbits */
     int mv[2];
+
+    /* preliminary subpel analysis, piggybacking on larger block searches */
+    int scost;
+    int smv[2];
 } x264_me_t;
 
 typedef struct
 {
     /* 16x16 */
     int i_ref;
+    uint8_t *p_cache_fpel; /* bitmap of which mvs have been tried */
     x264_me_t me16x16;
 
     /* 8x8 */
@@ -79,6 +100,7 @@
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
 static inline void x264_me_search( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc )
     { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
+void x264_me_search_multipart( x264_t *h, x264_mb_analysis_list_t *l, x264_me_t *m, int (*mvc)[2], int i_mvc, int
*p_fullpel_thresh ); void x264_me_refine_qpel( x264_t *h, x264_me_t *m ); Index: encoder/analyse.c =================================================================== --- encoder/analyse.c (revision 27) +++ encoder/analyse.c (working copy) @@ -156,6 +156,12 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) { + //FIXME move to common/macroblock.c + int i; + if( !h->mb.cache_fpel[0] ) + for( i = 0; i < 3; i++ ) + h->mb.cache_fpel[i] = x264_malloc( CACHE_FPEL_SIZE ); + memset( a, 0, sizeof( x264_mb_analysis_t ) ); /* conduct the analysis using this lamda and QP */ @@ -641,6 +647,124 @@ (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; +static void init_fenc( x264_t *h, x264_mb_analysis_t *a, x264_mb_analysis_list_t *l ) +{ + LOAD_FENC( &l->me16x16, h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me16x8[0], h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me16x8[1], h->mb.pic.p_fenc, 0, 8 ); + LOAD_FENC( &l->me8x16[0], h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me8x16[1], h->mb.pic.p_fenc, 8, 0 ); + LOAD_FENC( &l->me8x8[0], h->mb.pic.p_fenc, 0, 0 ); + LOAD_FENC( &l->me8x8[1], h->mb.pic.p_fenc, 8, 0 ); + LOAD_FENC( &l->me8x8[2], h->mb.pic.p_fenc, 0, 8 ); + LOAD_FENC( &l->me8x8[3], h->mb.pic.p_fenc, 8, 8 ); + l->me16x16.i_pixel = PIXEL_16x16; + l->me16x8[0].i_pixel = l->me16x8[1].i_pixel = PIXEL_16x8; + l->me8x16[0].i_pixel = l->me8x16[1].i_pixel = PIXEL_8x16; + l->me8x8[0].i_pixel = l->me8x8[1].i_pixel = + l->me8x8[2].i_pixel = l->me8x8[3].i_pixel = PIXEL_8x8; + l->me16x16.p_cost_mv = + l->me16x8[0].p_cost_mv = l->me16x8[1].p_cost_mv = + l->me8x16[0].p_cost_mv = l->me8x16[1].p_cost_mv = + l->me8x8[0].p_cost_mv = l->me8x8[1].p_cost_mv = + l->me8x8[2].p_cost_mv = l->me8x8[3].p_cost_mv = a->p_cost_mv; +} + +static void init_mvps( x264_t *h, x264_mb_analysis_list_t *l, int i_ref ) +{ + int i; + l->i_ref = i_ref; + LOAD_HPELS( &l->me16x16, 
h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me16x8[0], h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me16x8[1], h->mb.pic.p_fref[0][i_ref], 0, 8 ); + LOAD_HPELS( &l->me8x16[0], h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me8x16[1], h->mb.pic.p_fref[0][i_ref], 8, 0 ); + LOAD_HPELS( &l->me8x8[0], h->mb.pic.p_fref[0][i_ref], 0, 0 ); + LOAD_HPELS( &l->me8x8[1], h->mb.pic.p_fref[0][i_ref], 8, 0 ); + LOAD_HPELS( &l->me8x8[2], h->mb.pic.p_fref[0][i_ref], 0, 8 ); + LOAD_HPELS( &l->me8x8[3], h->mb.pic.p_fref[0][i_ref], 8, 8 ); + + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, i_ref ); + x264_mb_predict_mv_16x16( h, 0, i_ref, l->me16x16.mvp ); + + x264_mb_predict_mv( h, 0, 0, 4, l->me16x8[0].mvp ); + x264_macroblock_cache_mv( h, 0, 0, 4, 2, 0, l->me16x8[0].mvp[0], l->me16x8[0].mvp[1] ); + x264_mb_predict_mv( h, 0, 8, 4, l->me16x8[1].mvp ); + + x264_mb_predict_mv( h, 0, 0, 2, l->me8x16[0].mvp ); + x264_macroblock_cache_mv( h, 0, 0, 2, 4, 0, l->me8x16[0].mvp[0], l->me8x16[0].mvp[1] ); + x264_mb_predict_mv( h, 0, 4, 2, l->me8x16[1].mvp ); + + for( i = 0; i < 4; i++ ) + { + int *mvp = l->me8x8[i].mvp; + x264_mb_predict_mv( h, 0, 4*i, 2, mvp ); + x264_macroblock_cache_mv( h, 2*(i&1), i&2, 2, 2, 0, mvp[0], mvp[1] ); + } +} + +static void x264_mb_analyse_inter_p16x16_multipart( x264_t *h, x264_mb_analysis_t *a ) +{ + x264_mb_analysis_list_t l; + int i_ref; + int mvc[4][2], i_mvc; + int i_fullpel_thresh = INT_MAX; + int *p_fullpel_thresh = h->i_ref0>1 ? 
&i_fullpel_thresh : NULL; + //FIXME cleanup + void *cache_best = h->mb.cache_fpel[0], *cache_cur = h->mb.cache_fpel[2], *cache_tmp; + + init_fenc( h, a, &l ); + + a->l0.me16x16.cost = INT_MAX; + for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) + { + const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref ); + i_fullpel_thresh -= i_ref_cost; + + memset( cache_cur, 0, CACHE_FPEL_SIZE ); + l.p_cache_fpel = cache_cur; + init_mvps( h, &l, i_ref ); + x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); + x264_me_search_multipart( h, &l, &l.me16x16, mvc, i_mvc, p_fullpel_thresh ); + + l.me16x16.cost += i_ref_cost; + i_fullpel_thresh += i_ref_cost; + + if( l.me16x16.cost < a->l0.me16x16.cost ) + { + a->l0 = l; + cache_tmp = cache_cur; + cache_cur = cache_best; + cache_best = cache_tmp; + } + + /* save mv for predicting neighbors */ + h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = l.me16x16.mv[0]; + h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = l.me16x16.mv[1]; + } + + /* Set global ref, needed for all others modes */ + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); +// fprintf(stderr, "\np16x16 r%d: (%d,%d)\n", a->l0.i_ref, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1]); + + h->mb.cache_fpel[0] = cache_best; + h->mb.cache_fpel[2] = cache_cur; + + if( a->b_mbrd ) + { + a->i_best_satd = a->l0.me16x16.cost; + h->mb.i_type = P_L0; + h->mb.i_partition = D_16x16; + x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] ); + a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 ); + } + else + { + /* subtract ref cost, so we don't have to add it for the other P types */ + a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); + } +} + static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { x264_me_t m; @@ -708,6 +832,7 @@ /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; + /* FIXME shouldn't need predictors from large block sizes */ i_mvc 
= 1; mvc[0][0] = a->l0.me16x16.mv[0]; mvc[0][1] = a->l0.me16x16.mv[1]; @@ -718,6 +843,9 @@ const int x8 = i%2; const int y8 = i/2; +// fprintf(stderr, "p8x8[%d]: (%d,%d) -> (%d,%d) -> (%d,%d) ->", +// i, m->mvp[0], m->mvp[1], m->mv[0], m->mv[1], m->smv[0], m->smv[1]); + m->i_pixel = PIXEL_8x8; m->p_cost_mv = a->p_cost_mv; @@ -725,8 +853,10 @@ LOAD_HPELS( m, p_fref, 8*x8, 8*y8 ); x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp ); - x264_me_search( h, m, mvc, i_mvc ); + x264_me_search_multipart( h, &a->l0, m, mvc, i_mvc, NULL ); + //x264_me_search( h, m, mvc, i_mvc ); +// fprintf(stderr, " (%d,%d)\n", m->mv[0], m->mv[1]); x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] ); mvc[i_mvc][0] = m->mv[0]; @@ -764,6 +894,9 @@ { x264_me_t *m = &a->l0.me16x8[i]; +// fprintf(stderr, "p16x8[%d]: (%d,%d) -> (%d,%d) -> (%d,%d) ->", +// i, m->mvp[0], m->mvp[1], m->mv[0], m->mv[1], m->smv[0], m->smv[1]); + m->i_pixel = PIXEL_16x8; m->p_cost_mv = a->p_cost_mv; @@ -776,8 +909,10 @@ mvc[1][1] = a->l0.me8x8[2*i+1].mv[1]; x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp ); - x264_me_search( h, m, mvc, 2 ); + x264_me_search_multipart( h, &a->l0, m, NULL, 0, NULL ); + //x264_me_search( h, m, mvc, 2 ); +// fprintf(stderr, " (%d,%d)\n", m->mv[0], m->mv[1]); x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] ); } @@ -805,6 +940,9 @@ { x264_me_t *m = &a->l0.me8x16[i]; +// fprintf(stderr, "p8x16[%d]: (%d,%d) -> (%d,%d) -> (%d,%d) ->", +// i, m->mvp[0], m->mvp[1], m->mv[0], m->mv[1], m->smv[0], m->smv[1]); + m->i_pixel = PIXEL_8x16; m->p_cost_mv = a->p_cost_mv; @@ -817,8 +955,10 @@ mvc[1][1] = a->l0.me8x8[i+2].mv[1]; x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp ); - x264_me_search( h, m, mvc, 2 ); + x264_me_search_multipart( h, &a->l0, m, NULL, 0, NULL ); + //x264_me_search( h, m, mvc, 2 ); +// fprintf(stderr, " (%d,%d)\n", m->mv[0], m->mv[1]); x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] ); } @@ -1547,9 +1687,16 @@ x264_mb_analyse_load_costs( h, &analysis ); - 
x264_mb_analyse_inter_p16x16( h, &analysis );
     if( flags & X264_ANALYSE_PSUB16x16 )
+    {
+        x264_mb_analyse_inter_p16x16_multipart( h, &analysis );
+        //x264_mb_analyse_inter_p16x16( h, &analysis );
+        //x264_mb_analyse_inter_p16x8( h, &analysis );
+        //x264_mb_analyse_inter_p8x16( h, &analysis );
         x264_mb_analyse_inter_p8x8( h, &analysis );
+    }
+    else
+        x264_mb_analyse_inter_p16x16( h, &analysis );
 
     /* Select best inter mode */
     i_type = P_L0;
Index: common/common.h
===================================================================
--- common/common.h	(revision 27)
+++ common/common.h	(working copy)
@@ -357,6 +357,9 @@
     int8_t  *skipbp;            /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
     int8_t  *mb_transform_size; /* transform_size_8x8_flag of each mb */
 
+    /* motion estimation state */
+    uint8_t *cache_fpel[3];     /* bitmap of fullpel mvs already tried (CACHE_FPEL_SIZE bytes each). [L0, L1, tmp] */
+
     /* current value */
     int     i_type;
     int     i_partition;