Review notes (free text ahead of the first "diff --git" line; not part of any hunk):
 - The header name on the added "#include" line of the first hunk is missing in
   this copy of the patch. Presumably it is the header that provides
   START_TIMER/STOP_TIMER -- restore it before applying.
 - Line breaks and indentation were rebuilt from a whitespace-collapsed copy, so
   context may only match with whitespace-insensitive application.
 - NOTE(review): the phminposuw path is selected at compile time only; no check
   for SSE4.1 availability is visible in this patch -- confirm before enabling.

diff --git a/encoder/me.c b/encoder/me.c
index 25e1195..0f1a9e1 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -24,6 +24,7 @@
 
 #include "common/common.h"
 #include "me.h"
+#include
 
 /* presets selected from good points on the speed-vs-quality curve of several test videos
  * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
@@ -86,6 +87,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
 #define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
 {\
+    START_TIMER;\
     uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\
     h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
         pix_base + (m0x) + (m0y)*m->i_stride[0],\
@@ -93,14 +95,47 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
         pix_base + (m2x) + (m2y)*m->i_stride[0],\
         pix_base + (m3x) + (m3y)*m->i_stride[0],\
         m->i_stride[0], costs );\
-    costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
-    costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
-    costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
-    costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
-    COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
-    COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
-    COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
-    COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
+    if( __builtin_constant_p(m0x) )\
+    {\
+        /* Pack bcost and the 4 candidates into 16-bit lanes of costs[0..3]:\
+         * lane 0 = bcost, lanes 1,2,4,6 = candidates 0..3, lanes 3,5,7 = 0xffff.\
+         * deltas[] is indexed by the winning lane.\
+         * NOTE(review): assumes bcost and every candidate fit in 16 bits, and that\
+         * costs[] is 16-byte aligned for the memory operand - confirm at call sites. */\
+        const int8_t deltas[8][2] = {{0,0}, {(m0x),(m0y)}, {(m1x),(m1y)}, {0,0}, {(m2x),(m2y)}, {0,0}, {(m3x),(m3y)}, {0,0}};\
+        costs[0] = bcost + ((costs[0] + BITS_MVD( omx+(m0x), omy+(m0y) )) << 16);\
+        costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) ) - (1<<16);\
+        costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) ) - (1<<16);\
+        costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) ) - (1<<16);\
+        asm volatile(\
+            "phminposuw %1, %%xmm0 \n"\
+            "movd %%xmm0, %0 \n"\
+            :"=r"(bcost)\
+            :"m"(costs[0]), "m"(costs[1]), "m"(costs[2]), "m"(costs[3])\
+            :"xmm0"\
+        );\
+        int d = bcost >> 16;\
+        bcost &= 0xffff;\
+        /* lane 0 = nothing beat bcost, keep bmx/bmy; otherwise the winner is an offset from omx/omy, as in the scalar path */\
+        if( d )\
+        {\
+            bmx = omx + deltas[d][0];\
+            bmy = omy + deltas[d][1];\
+        }\
+        STOP_TIMER("x4_sse4");\
+    }\
+    else\
+    {\
+        costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
+        costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
+        costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
+        costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
+        COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
+        COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
+        COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
+        COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
+        STOP_TIMER("x4");\
+    }\
 }
 
 #define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
@@ -167,7 +202,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
 
     int i = 0, j;
     int dir;
-    int costs[6];
+    DECLARE_ALIGNED_16( int costs[6] );
 
     int mv_x_min = h->mb.mv_min_fpel[0];
     int mv_y_min = h->mb.mv_min_fpel[1];