diff --git a/common/common.c b/common/common.c index 924323a..d6c2991 100644 --- a/common/common.c +++ b/common/common.c @@ -405,7 +405,6 @@ void x264_param_apply_fastfirstpass( x264_param_t *param ) param->analyse.i_me_method = X264_ME_DIA; param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine ); param->analyse.i_trellis = 0; - param->analyse.b_fast_pskip = 1; } } diff --git a/common/common.h b/common/common.h index b8c6dfd..396e67f 100644 --- a/common/common.h +++ b/common/common.h @@ -626,7 +626,7 @@ struct x264_t /* pointer over mb of the references */ int i_fref[2]; - uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */ + uint8_t *p_fref[2][32][4+1]; /* last: lN, lH, lV, lHV, c */ uint8_t *p_fref_w[32]; /* weighted fullpel luma */ uint16_t *p_integral[2][16]; diff --git a/common/frame.c b/common/frame.c index abcfd14..087c353 100644 --- a/common/frame.c +++ b/common/frame.c @@ -44,20 +44,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) i_stride = ALIGN( i_width + 2*PADH, align ); i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced ); - frame->i_plane = 3; - for( int i = 0; i < 3; i++ ) - { - frame->i_stride[i] = ALIGN( i_stride >> !!i, align ); - frame->i_width[i] = i_width >> !!i; - frame->i_lines[i] = i_lines >> !!i; - } - - luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); - chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv)); - for( int i = 1; i < 3; i++ ) + frame->i_plane = 2; + for( int i = 0; i < 2; i++ ) { - CHECKED_MALLOC( frame->buffer[i], chroma_plane_size ); - frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; + frame->i_stride[i] = ALIGN( i_stride, align ); + frame->i_width[i] = i_width >> i; + frame->i_lines[i] = i_lines >> i; } for( int i = 0; i < h->param.i_bframe + 2; i++ ) @@ -83,6 +75,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) frame->orig = frame; + luma_plane_size = (frame->i_stride[0] * 
(frame->i_lines[0] + 2*i_padv)); + chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv)); + + CHECKED_MALLOC( frame->buffer[1], chroma_plane_size ); + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH; + /* all 4 luma planes allocated together, since the cacheline split code * requires them to be in-phase wrt cacheline alignment. */ if( h->param.analyse.i_subpel_refine && b_fdec ) @@ -235,34 +233,49 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) dst->param = src->param; dst->i_pic_struct = src->i_pic_struct; + uint8_t *plane[3]; + int stride[3]; for( int i = 0; i < 3; i++ ) { int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i; - uint8_t *plane = src->img.plane[s]; - int stride = src->img.i_stride[s]; - int width = h->param.i_width >> !!i; int height = h->param.i_height >> !!i; + plane[i] = src->img.plane[s]; + stride[i] = src->img.i_stride[s]; if( src->img.i_csp & X264_CSP_VFLIP ) { - plane += (height-1)*stride; - stride = -stride; + plane[i] += (height-1)*stride[i]; + stride[i] = -stride[i]; } - h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height ); } + + h->mc.plane_copy( dst->plane[0], dst->i_stride[0], plane[0], stride[0], + h->param.i_width, h->param.i_height ); + h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], plane[1], stride[1], plane[2], stride[2], + h->param.i_width>>1, h->param.i_height>>1 ); return 0; } -static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom ) +static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma ) { #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride ) for( int y = 0; y < i_height; y++ ) { - /* left band */ - memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh ); - /* right band */ - memset( PPIXEL(i_width, y), PPIXEL(i_width-1, 
y)[0], i_padh ); + if( b_chroma ) + { + for( int x = 0; x < i_padh; x+=2 ) + CP16( PPIXEL(-x, y), PPIXEL(0, y) ); + for( int x = 0; x < i_padh; x+=2 ) + CP16( PPIXEL(i_width+x, y), PPIXEL(i_width-2, y) ); + } + else + { + /* left band */ + memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh ); + /* right band */ + memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh ); + } } /* upper band */ if( b_pad_top ) @@ -283,9 +296,9 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e for( int i = 0; i < frame->i_plane; i++ ) { int stride = frame->i_stride[i]; - int width = 16*h->sps->i_mb_width >> !!i; + int width = 16*h->sps->i_mb_width; int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i; - int padh = PADH >> !!i; + int padh = PADH; int padv = PADV >> !!i; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i); @@ -293,12 +306,12 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e height += 4 >> (!!i + h->sh.b_mbaff); if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i ); } else { - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i ); } } } @@ -320,36 +333,40 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4; if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - 
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 ); } else - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 ); } } void x264_frame_expand_border_lowres( x264_frame_t *frame ) { for( int i = 0; i < 4; i++ ) - plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 ); + plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 ); } void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame ) { for( int i = 0; i < frame->i_plane; i++ ) { - int i_subsample = i ? 1 : 0; - int i_width = h->param.i_width >> i_subsample; - int i_height = h->param.i_height >> i_subsample; - int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample; - int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample; + int i_width = h->param.i_width; + int i_height = h->param.i_height >> !!i; + int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width); + int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> !!i; if( i_padx ) { for( int y = 0; y < i_height; y++ ) - memset( &frame->plane[i][y*frame->i_stride[i] + i_width], - frame->plane[i][y*frame->i_stride[i] + i_width - 1], - i_padx ); + if( i ) + for( int x = 0; x < i_padx; x+=2 ) + CP16( &frame->plane[i][y*frame->i_stride[i] + i_width + x], + &frame->plane[i][y*frame->i_stride[i] + i_width - 2] ); + else + memset( &frame->plane[i][y*frame->i_stride[i] + i_width], + frame->plane[i][y*frame->i_stride[i] + i_width - 1], + i_padx ); } if( i_pady ) { @@ -508,7 +525,8 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int 
ystride, int pix += 2*ystride; continue; } - for( int d = 0; d < 2; d++ ) + for( int d = 0; d < 2; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -521,17 +539,16 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */ } - pix += ystride; } } } static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 ); + deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); } static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) @@ -586,9 +603,10 @@ static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int bet deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int dir ) { - for( int d = 0; d < 8; d++ ) + for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) + for( int e = 0; e < (dir?1:2); e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -600,19 +618,18 @@ static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystrid pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } - pix += ystride; } } static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, stride, 1, alpha, beta ); + deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 ); } 
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, 1, stride, alpha, beta ); + deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 ); } -static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) +static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp + h->sh.i_alpha_c0_offset; int alpha = alpha_table(index_a); @@ -627,12 +644,10 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_ tc[2] = tc0_table(index_a)[bS[2]] + b_chroma; tc[3] = tc0_table(index_a)[bS[3]] + b_chroma; - pf_inter( pix1, i_stride, alpha, beta, tc ); - if( b_chroma ) - pf_inter( pix2, i_stride, alpha, beta, tc ); + pf_inter( pix, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static inline void deblock_edge_intra( x264_t *h, uint8_t *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) { int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset); int beta = beta_table(i_qp + h->sh.i_beta_offset); @@ -640,9 +655,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, if( !alpha || !beta ) return; - pf_intra( pix1, i_stride, alpha, beta ); - if( b_chroma ) - pf_intra( pix2, i_stride, alpha, beta ); + pf_intra( pix, i_stride, alpha, beta ); } void x264_frame_deblock_row( x264_t *h, int mb_y ) @@ -672,13 +685,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int i_qp = h->mb.qp[mb_xy]; int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 
1 : 4; uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; - uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x; - uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x; + uint8_t *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x; if( b_interlaced && (mb_y&1) ) { pixy -= 15*stridey; - pixu -= 7*strideuv; - pixv -= 7*strideuv; + pixuv -= 7*strideuv; } x264_prefetch_fenc( h, h->fdec, mb_x, mb_y ); @@ -693,14 +704,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) if( i_dir == 0 )\ {\ /* vertical edge */\ - deblock_edge##intra( h, pixy + 4*i_edge, NULL,\ + deblock_edge##intra( h, pixy + 4*i_edge,\ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\ h->loopf.deblock_h_luma##intra );\ if( !(i_edge & 1) )\ {\ /* U/V planes */\ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\ - deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\ + deblock_edge##intra( h, pixuv + 4*i_edge,\ stride2uv, bS, i_qpc, 1,\ h->loopf.deblock_h_chroma##intra );\ }\ @@ -708,14 +719,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) else\ {\ /* horizontal edge */\ - deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\ + deblock_edge##intra( h, pixy + 4*i_edge*stride2y,\ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\ h->loopf.deblock_v_luma##intra );\ /* U/V planes */\ if( !(i_edge & 1) )\ {\ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\ - deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\ + deblock_edge##intra( h, pixuv + 2*i_edge*stride2uv,\ stride2uv, bS, i_qpc, 1,\ h->loopf.deblock_v_chroma##intra );\ }\ @@ -831,21 +842,25 @@ void x264_frame_deblock( x264_t *h ) } #ifdef HAVE_MMX -void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -void 
x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); - void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); #ifdef ARCH_X86 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +// FIXME this wrapper has a significant cpu cost static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 ); @@ -885,22 +900,26 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) #ifdef HAVE_MMX if( cpu&X264_CPU_MMXEXT ) { - pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext; - pf->deblock_h_chroma 
= x264_deblock_h_chroma_mmxext; - pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; - pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; #ifdef ARCH_X86 pf->deblock_v_luma = x264_deblock_v_luma_mmxext; pf->deblock_h_luma = x264_deblock_h_luma_mmxext; + pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext; + pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; + pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; #endif if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_v_luma = x264_deblock_v_luma_sse2; pf->deblock_h_luma = x264_deblock_h_luma_sse2; + pf->deblock_v_chroma = x264_deblock_v_chroma_sse2; + pf->deblock_h_chroma = x264_deblock_h_chroma_sse2; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_sse2; + pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_sse2; } } #endif @@ -918,8 +937,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) { pf->deblock_v_luma = x264_deblock_v_luma_neon; pf->deblock_h_luma = x264_deblock_h_luma_neon; - pf->deblock_v_chroma = x264_deblock_v_chroma_neon; - pf->deblock_h_chroma = x264_deblock_h_chroma_neon; +// pf->deblock_v_chroma = x264_deblock_v_chroma_neon; +// pf->deblock_h_chroma = x264_deblock_h_chroma_neon; } #endif } diff --git a/common/macroblock.c b/common/macroblock.c index 0b9b903..51ef856 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -503,7 +503,8 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h if( h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( 
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); @@ -511,11 +512,6 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->sh.weight[i_ref][1], height*2 ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); - if( h->sh.weight[i_ref][2].weightfn ) h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, @@ -536,13 +532,10 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h if( h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); } static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height ) @@ -573,16 +566,12 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int if( h->mb.b_interlaced & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], 
h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], mvx1, mvy1, 2*width, 2*height ); h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2], - mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2], - mvx1, mvy1, 2*width, 2*height ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); + h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } void x264_mb_mc_8x8( x264_t *h, int i8 ) @@ -756,11 +745,11 @@ int x264_macroblock_cache_init( x264_t *h ) } for( int i = 0; i <= h->param.b_interlaced; i++ ) - for( int j = 0; j < 3; j++ ) + for( int j = 0; j < 2; j++ ) { /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */ - CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j ); - h->mb.intra_border_backup[i][j] += 8; + CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], h->sps->i_mb_width*16+32 ); + h->mb.intra_border_backup[i][j] += 16; } return 0; @@ -769,8 +758,8 @@ fail: return -1; void x264_macroblock_cache_end( x264_t *h ) { for( int i = 0; i <= h->param.b_interlaced; i++ ) - for( int j = 0; j < 3; j++ ) - x264_free( h->mb.intra_border_backup[i][j] - 8 ); + for( int j = 0; j < 2; j++ ) + x264_free( h->mb.intra_border_backup[i][j] - 16 ); for( int i = 0; i < 2; i++ ) for( int j = 0; j < 32; j++ ) x264_free( h->mb.mvr[i][j] ); @@ -876,10 +865,10 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) { int stride_y = fenc->i_stride[0]; int stride_uv = fenc->i_stride[1]; - int off_y = 16 * (i_mb_x + i_mb_y * stride_y); - int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv); + int off_y = 16 * i_mb_x 
+ 16 * i_mb_y * stride_y; + int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv; h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, - fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x ); + fenc->plane[1]+off_uv, stride_uv, i_mb_x ); } static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src ) @@ -891,34 +880,54 @@ static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src ) static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i ) { - const int w = (i == 0 ? 16 : 8); - const int i_stride = h->fdec->i_stride[!!i]; - const int i_stride2 = i_stride << h->mb.b_interlaced; - const int i_pix_offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride - : w * (mb_x + mb_y * i_stride); - const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; - const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 : - &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i]; + int w = (i ? 8 : 16); + int i_stride = h->fdec->i_stride[i]; + int i_stride2 = i_stride << h->mb.b_interlaced; + int i_pix_offset = h->mb.b_interlaced + ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + w * mb_y * i_stride; + uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; + uint8_t *intra_fdec = h->param.b_sliced_threads ? 
plane_fdec-i_stride2 : + &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16]; int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; x264_frame_t **fref[2] = { h->fref0, h->fref1 }; if( h->mb.b_interlaced ) ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride; h->mb.pic.i_stride[i] = i_stride2; h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, - h->mb.pic.p_fenc_plane[i], i_stride2, w ); - if( mb_y > 0 ) - memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); + if( i ) + { + h->mc.load_deinterleave_8x8x2( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 ); + // FIXME intra_border_backup could be stored blockwise, thus eliminating this deinterleave + if( mb_y > 0 ) + h->mc.load_deinterleave_9x1x2( &h->mb.pic.p_fdec[1][-1-FDEC_STRIDE], &h->mb.pic.p_fdec[2][-1-FDEC_STRIDE], intra_fdec-2 ); + else + { + memset( &h->mb.pic.p_fdec[1][-1-FDEC_STRIDE], 0, 9 ); + memset( &h->mb.pic.p_fdec[2][-1-FDEC_STRIDE], 0, 9 ); + } + } else - memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 ); + { + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 ); + if( mb_y > 0 ) + memcpy( &h->mb.pic.p_fdec[0][-1-FDEC_STRIDE], intra_fdec-1, 25 ); + else + memset( &h->mb.pic.p_fdec[0][-1-FDEC_STRIDE], 0, 25 ); + } if( h->mb.b_interlaced ) for( int j = 0; j < w; j++ ) - h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + if( i ) + { + h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; + h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + } + else + h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; for( int j = 0; j < h->mb.pic.i_fref[0]; j++ ) { - h->mb.pic.p_fref[0][j][i==0 ? 
0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) + h->mb.pic.p_fref[0][j][i?4:0] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( !i ) { for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; @@ -931,8 +940,8 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x if( h->sh.i_type == SLICE_TYPE_B ) for( int j = 0; j < h->mb.pic.i_fref[1]; j++ ) { - h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) + h->mb.pic.p_fref[1][j][i?4:0] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( !i ) for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[1][j][k] = &fref[1][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; } @@ -958,65 +967,66 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i h->mb.i_mb_type_topleft = -1; h->mb.i_mb_type_topright = -1; + if( top >= 0 ) + { + h->mb.i_neighbour_frame |= MB_TOP; + h->mb.i_mb_top_xy = top; + if( top >= h->sh.i_first_mb ) + { + h->mb.i_neighbour |= MB_TOP; + h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy]; + + if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) ) + h->mb.i_neighbour_intra |= MB_TOP; + } + } + if( mb_x > 0 ) { h->mb.i_neighbour_frame |= MB_LEFT; h->mb.i_mb_left_xy = h->mb.i_mb_xy - 1; - h->mb.i_mb_type_left = h->mb.type[h->mb.i_mb_left_xy]; if( h->mb.i_mb_xy > h->sh.i_first_mb ) { h->mb.i_neighbour |= MB_LEFT; + h->mb.i_mb_type_left = h->mb.type[h->mb.i_mb_left_xy]; if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left ) ) h->mb.i_neighbour_intra |= MB_LEFT; } } - /* We can't predict from the previous threadslice since it hasn't been encoded yet. 
*/ - if( (h->i_threadslice_start >> h->mb.b_interlaced) != (mb_y >> h->mb.b_interlaced) ) + if( mb_x > 0 && top - 1 >= 0 ) { - if( top >= 0 ) + h->mb.i_neighbour_frame |= MB_TOPLEFT; + h->mb.i_mb_topleft_xy = top - 1; + if( top - 1 >= h->sh.i_first_mb ) { - h->mb.i_neighbour_frame |= MB_TOP; - h->mb.i_mb_top_xy = top; - h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy]; - if( top >= h->sh.i_first_mb ) - { - h->mb.i_neighbour |= MB_TOP; - - if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) ) - h->mb.i_neighbour_intra |= MB_TOP; - } - } - - if( mb_x > 0 && top - 1 >= 0 ) - { - h->mb.i_neighbour_frame |= MB_TOPLEFT; - h->mb.i_mb_topleft_xy = top - 1; + h->mb.i_neighbour |= MB_TOPLEFT; h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topleft_xy]; - if( top - 1 >= h->sh.i_first_mb ) - { - h->mb.i_neighbour |= MB_TOPLEFT; - if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) ) - h->mb.i_neighbour_intra |= MB_TOPLEFT; - } + if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) ) + h->mb.i_neighbour_intra |= MB_TOPLEFT; } + } - if( mb_x < h->sps->i_mb_width - 1 && top + 1 >= 0 ) + if( mb_x < h->sps->i_mb_width - 1 && top + 1 >= 0 ) + { + h->mb.i_neighbour_frame |= MB_TOPRIGHT; + h->mb.i_mb_topright_xy = top + 1; + if( top + 1 >= h->sh.i_first_mb ) { - h->mb.i_neighbour_frame |= MB_TOPRIGHT; - h->mb.i_mb_topright_xy = top + 1; + h->mb.i_neighbour |= MB_TOPRIGHT; h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topright_xy]; - if( top + 1 >= h->sh.i_first_mb ) - { - h->mb.i_neighbour |= MB_TOPRIGHT; - if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) ) - h->mb.i_neighbour_intra |= MB_TOPRIGHT; - } + if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) ) + h->mb.i_neighbour_intra |= MB_TOPRIGHT; } } + + /* We can't predict from the previous threadslice since it hasn't been encoded yet, so + * only use left. 
*/ + if( h->i_threadslice_start == mb_y ) + h->mb.i_neighbour_frame &= MB_LEFT; } void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) @@ -1122,7 +1132,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) /* load picture pointers */ x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0 ); x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1 ); - x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2 ); if( h->fdec->integral ) { @@ -1267,13 +1276,15 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i ) { int w = i ? 8 : 16; - int i_stride = h->fdec->i_stride[!!i]; + int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << h->mb.b_interlaced; int i_pix_offset = h->mb.b_interlaced - ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride - : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride); - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, - h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); + ? 
16 * h->mb.i_mb_x + w * (h->mb.i_mb_y&~1) * i_stride + (h->mb.i_mb_y&1) * i_stride + : 16 * h->mb.i_mb_x + w * h->mb.i_mb_y * i_stride; + if( i ) + h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] ); + else + h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 ); } void x264_macroblock_cache_save( x264_t *h ) @@ -1292,7 +1303,6 @@ void x264_macroblock_cache_save( x264_t *h ) x264_macroblock_store_pic( h, 0 ); x264_macroblock_store_pic( h, 1 ); - x264_macroblock_store_pic( h, 2 ); x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); diff --git a/common/mc.c b/common/mc.c index 859e5fc..e5a1957 100644 --- a/common/mc.c +++ b/common/mc.c @@ -260,7 +260,7 @@ static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride, } /* full chroma mc (ie until 1/8 pixel)*/ -static void mc_chroma( uint8_t *dst, int i_dst_stride, +static void mc_chroma( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, uint8_t *src, int i_src_stride, int mvx, int mvy, int i_width, int i_height ) @@ -274,14 +274,20 @@ static void mc_chroma( uint8_t *dst, int i_dst_stride, int cC = (8-d8x)*d8y; int cD = d8x *d8y; - src += (mvy >> 3) * i_src_stride + (mvx >> 3); + src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; for( int y = 0; y < i_height; y++ ) { for( int x = 0; x < i_width; x++ ) - dst[x] = ( cA*src[x] + cB*src[x+1] + cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6; - dst += i_dst_stride; + { + dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] + + cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6; + dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] + + cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6; + } + dstu += i_dst_stride; + dstv += i_dst_stride; src = srcp; srcp += i_src_stride; } @@ -297,7 +303,7 @@ MC_COPY( 8 ) MC_COPY( 4 ) void x264_plane_copy_c( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int w, int h) + uint8_t *src, int i_src, int w, int h ) { while( h-- ) { @@ -307,6 
+313,45 @@ void x264_plane_copy_c( uint8_t *dst, int i_dst, } } +void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ) +{ + for( int y=0; ycopy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; + pf->store_interleave_8x8x2 = store_interleave_8x8x2; + pf->load_deinterleave_8x8x2 = load_deinterleave_8x8x2; + pf->load_deinterleave_9x1x2 = load_deinterleave_9x1x2; + pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_interleave = x264_plane_copy_interleave_c; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; + pf->hpel_filter = hpel_filter; pf->prefetch_fenc = prefetch_fenc_null; diff --git a/common/mc.h b/common/mc.h index 68bba48..7610575 100644 --- a/common/mc.h +++ b/common/mc.h @@ -68,7 +68,7 @@ typedef struct /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. */ - void (*mc_chroma)(uint8_t *dst, int i_dst, uint8_t *src, int i_src, + void (*mc_chroma)(uint8_t *dstu, uint8_t *dstv, int i_dst, uint8_t *src, int i_src, int mvx, int mvy, int i_width, int i_height ); @@ -78,8 +78,18 @@ typedef struct void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height ); void (*copy_16x16_unaligned)( uint8_t *dst, int, uint8_t *src, int, int i_height ); + void (*store_interleave_8x8x2)( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); + void (*load_deinterleave_8x8x2)( uint8_t *dst, uint8_t *src, int i_src ); + void (*load_deinterleave_9x1x2)( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); + void (*plane_copy)( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int w, int h); + uint8_t *src, int i_src, int w, int h ); + void (*plane_copy_interleave)( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); + void (*plane_copy_deinterleave)( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); void (*hpel_filter)( 
uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int i_stride, int i_width, int i_height, int16_t *buf ); diff --git a/common/pixel.c b/common/pixel.c index 449a752..f38d48e 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -96,9 +96,9 @@ PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) -int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) { - int64_t i_ssd = 0; + uint64_t i_ssd = 0; int y; int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15); @@ -114,13 +114,8 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 SSD(PIXEL_8x16); } if( y < i_height-7 ) - { - int x = 0; - for( ; x < i_width-15; x += 16 ) - SSD(PIXEL_16x8); - for( ; x < i_width-7; x += 8 ) + for( int x = 0; x < i_width-7; x += 8 ) SSD(PIXEL_8x8); - } #undef SSD #define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; } @@ -141,6 +136,31 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 return i_ssd; } +static uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height ) +{ + uint32_t ssd_u=0, ssd_v=0; + for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 ) + for( int x = 0; x < width; x++ ) + { + int du = pixuv1[2*x] - pixuv2[2*x]; + int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; + ssd_u += du*du; + ssd_v += dv*dv; + } + return ssd_u + ((uint64_t)ssd_v<<32); +} + +// SSD in uint32 (i.e. packing two into uint64) can potentially overflow on +// image widths >= 11008 (or 6604 if interlaced), since this is called on blocks +// of height up to 12 (resp 20). Though it will probably take significantly more +// than that at sane distortion levels. 
+uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) +{ + uint64_t ssd = pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height ); + if( i_width&7 ) + ssd += pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height ); + return ssd; +} /**************************************************************************** * pixel_var_wxh @@ -637,6 +657,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var[PIXEL_16x16] = x264_pixel_var_16x16; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8; + pixf->ssd_nv12_core = pixel_ssd_nv12_core; pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; pixf->var2_8x8 = pixel_var2_8x8; @@ -660,6 +681,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT_ADS( _mmxext ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext; #ifdef ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; @@ -705,6 +727,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( ssd, _sse2slow ); INIT2_NAME( sad_aligned, sad, _sse2_aligned ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; diff --git a/common/pixel.h b/common/pixel.h index 1102642..7747e32 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -78,6 +78,8 @@ typedef struct uint64_t (*var[4])( uint8_t *pix, int stride ); uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride ); + uint64_t (*ssd_nv12_core)( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); void (*ssim_4x4x2_core)( const uint8_t 
*pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); @@ -110,7 +112,8 @@ typedef struct } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); -int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); +uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf ); #endif diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 00d0418..9912dfa 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -21,6 +21,7 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION_RODATA pb_00: times 16 db 0x00 @@ -35,74 +36,123 @@ SECTION .text [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro 
TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + mova m1, m0 + mova m3, m2 + punpckl%3 m0, m4 + punpckh%3 m1, m4 + punpckl%3 m2, m6 + punpckh%3 m3, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 +%macro TRANSPOSE8x4B_STORE 8 + mova m4, m0 + mova m5, m1 + mova m6, m2 punpckhdq m4, m4 punpckhdq m5, m5 punpckhdq m6, m6 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 + mova m1, m0 punpcklwd m0, m2 punpckhwd m1, m2 - movd %1, m0 + movh %1, m0 punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + movh %2, m0 + movh %3, m1 punpckhdq m1, m1 - movd %4, m1 + movh %4, m1 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 + mova m5, m4 punpcklwd m4, m6 punpckhwd m5, m6 - movd %5, m4 + movh %5, m4 punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + movh %6, m4 + movh %7, m5 punpckhdq m5, m5 - movd %8, m5 + movh %8, m5 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro + +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==16 + TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 +%else + SWAP 1, 4, 2, 3 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [t5+r1*2] + mova m3, [t5+t6] + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif +%endmacro + +%macro TRANSPOSE8x2W_STORE 8 + mova m0, m1 + punpcklwd m1, m2 + punpckhwd m0, m2 +%if mmsize==8 + movd %1, m1 + movd %3, m0 + psrlq m1, 32 + psrlq m0, 32 + movd %2, m1 + movd %4, m0 +%else + movd %1, m1 + movd %5, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %2, m1 + movd %6, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %3, m1 + movd %7, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %4, m1 + movd %8, m0 +%endif %endmacro -%macro 
SBUTTERFLY 4 +%macro SBUTTERFLY0 4 movq %4, %2 punpckl%1 %2, %3 punpckh%1 %4, %3 @@ -111,6 +161,7 @@ SECTION .text ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -118,30 +169,32 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY bw, m6, %8, m5 - SBUTTERFLY wd, m0, m2, m1 - SBUTTERFLY wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY0 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY wd, m7, [%9+0x10], m6 - SBUTTERFLY wd, m3, m5, m4 - SBUTTERFLY dq, m7, m3, m0 - SBUTTERFLY dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY0 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -149,29 +202,30 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - SBUTTERFLY bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY wd, m0, m2, m3 - SBUTTERFLY wd, m4, m6, m2 - SBUTTERFLY wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY wd, m2, m5, m1 - SBUTTERFLY dq, m0, m4, m5 - SBUTTERFLY dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY0 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + 
SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY dq, m3, %11, m0 - SBUTTERFLY dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY0 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 @@ -360,7 +414,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -370,7 +424,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -479,7 +533,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -487,7 +541,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -762,117 +816,152 @@ DEBLOCK_LUMA_INTRA mmxext, v8 -INIT_MMX - %macro CHROMA_V_START 0 dec r2d ; alpha-1 dec r3d ; beta-1 mov t5, r0 sub t5, r1 sub t5, r1 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif %endmacro %macro CHROMA_H_START 0 dec r2d dec r3d - sub r0, 2 + sub r0, 4 lea t6, [r1*3] mov t5, r0 add r0, t6 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif +%endmacro + +%macro CHROMA_V_LOOP 1 +%if mmsize==8 + add r0, 8 + add t5, 8 +%if %1 + add r4, 2 +%endif + 
dec dword r0m + jg .skip_prologue +%endif +%endmacro + +%macro CHROMA_H_LOOP 1 +%if mmsize==8 + lea r0, [r0+r1*4] + lea t5, [t5+r1*4] +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif %endmacro %define t5 r5 %define t6 r6 +%macro DEBLOCK_CHROMA 1 ;----------------------------------------------------------------------------- ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal x264_deblock_v_chroma_%1, 5,6,8 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_inter_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 1 RET ;----------------------------------------------------------------------------- ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 -%ifdef ARCH_X86_64 - %define buf0 [rsp-24] - %define buf1 [rsp-16] -%else - %define buf0 r0m - %define buf1 r2m -%endif +cglobal x264_deblock_h_chroma_%1, 5,7,8 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - call chroma_inter_body_mmxext - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_inter_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 1 RET ALIGN 16 -chroma_inter_body_mmxext: +RESET_MM_PERMUTATION +chroma_inter_body_%1: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 + punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 ret +%endmacro ; DEBLOCK_CHROMA +INIT_XMM +DEBLOCK_CHROMA sse2 +%ifndef ARCH_X86_64 
+INIT_MMX +DEBLOCK_CHROMA mmxext +%endif ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 %macro CHROMA_INTRA_P0 3 - movq m4, %1 + mova m4, %1 pxor m4, %3 pand m4, [pb_01] ; m4 = (p0^q1)&1 pavgb %1, %3 psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) %endmacro %define t5 r4 %define t6 r5 +%macro DEBLOCK_CHROMA_INTRA 1 ;----------------------------------------------------------------------------- ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal x264_deblock_v_chroma_intra_%1, 4,5,8 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_intra_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 0 RET ;----------------------------------------------------------------------------- ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal x264_deblock_h_chroma_intra_%1, 4,6,8 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 0 RET ALIGN 16 -chroma_intra_body_mmxext: +RESET_MM_PERMUTATION +chroma_intra_body_%1: LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 + mova m5, m1 + mova m6, m2 CHROMA_INTRA_P0 m1, m0, m3 CHROMA_INTRA_P0 m2, m3, m0 psubb m1, m5 @@ -882,3 +971,11 @@ chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret 
+%endmacro ; DEBLOCK_CHROMA_INTRA + +INIT_XMM +DEBLOCK_CHROMA_INTRA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA_INTRA mmxext +%endif diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 6dbb5fc..8063927 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -28,12 +28,16 @@ SECTION_RODATA 32 -ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 -pw_1: times 8 dw 1 +ch_shuffle: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuffle_adj: times 8 db 0 + times 8 db 2 + times 8 db 4 + times 8 db 6 pw_4: times 8 dw 4 pw_8: times 8 dw 8 pw_32: times 8 dw 32 pw_64: times 8 dw 64 +pw_ff: times 8 dw 0xff sw_64: dd 64 SECTION .text @@ -892,28 +896,27 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa ;----------------------------------------------------------------------------- %ifdef ARCH_X86_64 cglobal x264_prefetch_fenc_mmxext, 5,5 + and r4d, 3 mov eax, r4d - and eax, 3 - imul eax, r1d - lea r0, [r0+rax*4+64] + imul r4d, r1d + lea r0, [r0+r4*4+64] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] - and r4d, 6 - imul r4d, r3d - lea r2, [r2+r4+64] + imul eax, r3d + lea r2, [r2+rax*2+64] prefetcht0 [r2] prefetcht0 [r2+r3] RET %else -cglobal x264_prefetch_fenc_mmxext - mov r2, [esp+20] - mov r1, [esp+8] - mov r0, [esp+4] +cglobal x264_prefetch_fenc_mmxext, 0,3 + mov r2, r4m + mov r1, r1m + mov r0, r0m and r2, 3 imul r2, r1 lea r0, [r0+r2*4+64] @@ -923,12 +926,12 @@ cglobal x264_prefetch_fenc_mmxext prefetcht0 [r0] prefetcht0 [r0+r1] - mov r2, [esp+20] - mov r1, [esp+16] - mov r0, [esp+12] - and r2, 6 + mov r2, r4m + mov r1, r3m + mov r0, r2m + and r2, 3 imul r2, r1 - lea r0, [r0+r2+64] + lea r0, [r0+r2*2+64] prefetcht0 [r0] prefetcht0 [r0+r1] ret @@ -959,237 +962,373 @@ cglobal x264_prefetch_ref_mmxext, 3,3 ; chroma MC ;============================================================================= - %define t0 rax %ifdef ARCH_X86_64 - %define t1 r10 + DECLARE_REG_TMP 10,11,6 %else - 
%define t1 r1 + DECLARE_REG_TMP 0,1,2 %endif %macro MC_CHROMA_START 0 - movifnidn r2, r2mp - movifnidn r3d, r3m + movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - mov t0d, r5d - mov t1d, r4d + movifnidn t2d, r6m + mov t0d, t2d + mov t1d, r5d sar t0d, 3 sar t1d, 3 - imul t0d, r3d - add t0d, t1d + imul t0d, r4d + lea t0d, [t0+t1*2] movsxdifnidn t0, t0d - add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride + add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride +%endmacro + +%macro UNPACK_UNALIGNED_MEM 3 + punpcklwd %1, %3 +%endmacro + +%macro UNPACK_UNALIGNED_LOAD 3 + movh %2, %3 + punpcklwd %1, %2 %endmacro ;----------------------------------------------------------------------------- -; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride, +; void x264_mc_chroma_mmxext( uint8_t *dstu, uint8_t *dstv, int dst_stride, ; uint8_t *src, int src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1-2 0 -cglobal x264_mc_chroma_%1 -%if mmsize == 16 - cmp dword r6m, 4 - jle x264_mc_chroma_mmxext -%endif - PROLOGUE 0,6,%2 +; FIXME broken on win64 +%macro MC_CHROMA 1 +cglobal x264_mc_chroma_%1, 0,6 MC_CHROMA_START - pxor m3, m3 - and r4d, 7 ; dx &= 7 + and r5d, 7 +%ifdef ARCH_X86_64 jz .mc1dy - and r5d, 7 ; dy &= 7 - jz .mc1dx - - movd m5, r4d - movd m6, r5d - SPLATW m5, m5 ; m5 = dx - SPLATW m6, m6 ; m6 = dy - - mova m4, [pw_8] - mova m0, m4 - psubw m4, m5 ; m4 = 8-dx - psubw m0, m6 ; m0 = 8-dy - - mova m7, m5 - pmullw m5, m0 ; m5 = dx*(8-dy) = cB - pmullw m7, m6 ; m7 = dx*dy = cD - pmullw m6, m4 ; m6 = (8-dx)*dy = cC - pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA - - mov r4d, r7m +%endif + and t2d, 7 %ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 + jz .mc1dx +%endif + shl r5d, 16 + add t2d, r5d + mov t0d, t2d + shl t2d, 8 + sub t2d, t0d + add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y) + cmp dword r7m, 4 +%if mmsize==8 +.skip_prologue: %else - mov r0, r0mp - mov 
r1, r1m - mov r5, r2 + jl x264_mc_chroma_mmxext %+ .skip_prologue +%ifdef WIN64 + SPILL_XMM 9 %endif - -.loop2d: - movh m1, [r2+r3] - movh m0, [r2] - punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 - punpcklbw m0, m3 - pmullw m1, m6 ; 2nd line * cC - pmullw m0, m4 ; 1st line * cA - paddw m0, m1 ; m0 <- result - - movh m2, [r2+1] - movh m1, [r2+r3+1] - punpcklbw m2, m3 - punpcklbw m1, m3 - - paddw m0, [pw_32] - - pmullw m2, m5 ; line * cB - pmullw m1, m7 ; line * cD +%endif + movd m5, t2d + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + pxor m6, m6 + punpcklbw m5, m6 +%if mmsize==8 + pshufw m7, m5, 0xee + pshufw m6, m5, 0x00 + pshufw m5, m5, 0x55 + jge .width4 +%else + pshufd m7, m5, 0x55 + punpcklwd m5, m5 + pshufd m6, m5, 0x00 + pshufd m5, m5, 0x55 + jg .width8 +%endif + movu m0, [r3] + UNPACK_UNALIGNED m0, m1, [r3+2] + mova m1, m0 + pand m0, [pw_ff] + psrlw m1, 8 + pmaddwd m0, m7 + pmaddwd m1, m7 + packssdw m0, m1 + SWAP m3, m0 +ALIGN 4 +.loop2: + movu m0, [r3+r4] + UNPACK_UNALIGNED m0, m1, [r3+r4+2] + pmullw m3, m6 + mova m1, m0 + pand m0, [pw_ff] + psrlw m1, 8 + pmaddwd m0, m7 + pmaddwd m1, m7 + mova m2, [pw_32] + packssdw m0, m1 + paddw m2, m3 + mova m3, m0 + pmullw m0, m5 paddw m0, m2 - paddw m0, m1 psrlw m0, 6 + packuswb m0, m0 + movd [r0], m0 +%if mmsize==8 + psrlq m0, 16 +%else + psrldq m0, 4 +%endif + movd [r1], m0 + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop2 + REP_RET - packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4 - movh [r0], m0 - - add r2, r3 - add r0, r1 ; dst_stride - dec r4d - jnz .loop2d - -%if mmsize == 8 - sub dword r6m, 8 - jnz .finish ; width != 8 so assume 4 +%if mmsize==8 +.width4: %ifdef ARCH_X86_64 - lea r0, [r10+4] ; dst - lea r2, [r11+4] ; src + mov t0, r0 + mov t1, r1 + mov t2, r3 + %define multy0 [rsp-8] + mova multy0, m5 %else - mov r0, r0mp - lea r2, [r5+4] - add r0, 4 + mov r3m, r3 + %define multy0 r4m + mova multy0, m5 +%endif +%else +.width8: +%ifdef ARCH_X86_64 + %define 
multy0 m8 + SWAP m8, m5 +%else + %define multy0 r0m + mova multy0, m5 +%endif +%endif +.loopx: + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + mova m2, m0 + mova m3, m1 + pand m0, [pw_ff] + pand m1, [pw_ff] + psrlw m2, 8 + psrlw m3, 8 + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + SWAP m4, m0 + SWAP m5, m1 + add r3, r4 +ALIGN 4 +.loop4: + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + mova m2, m0 + mova m3, m1 + pand m0, [pw_ff] + pand m1, [pw_ff] + psrlw m2, 8 + psrlw m3, 8 + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + pmullw m4, m6 + pmullw m5, m6 + mova m2, [pw_32] + mova m3, m2 + paddw m2, m4 + paddw m3, m5 + mova m4, m0 + mova m5, m1 + pmullw m0, multy0 + pmullw m1, multy0 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 6 + psrlw m1, 6 + packuswb m0, m1 +%if mmsize==8 + pshufw m1, m0, 0x8 + pshufw m0, m0, 0xd + movd [r0], m1 + movd [r1], m0 +%else + pshufd m0, m0, 0xd8 + movq [r0], m0 + movhps [r1], m0 %endif - mov r4d, r7m ; height - jmp .loop2d + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop4 +%if mmsize!=8 + REP_RET %else + sub dword r7m, 4 + jg .width8 REP_RET -%endif ; mmsize +.width8: +%ifdef ARCH_X86_64 + lea r3, [t2+8] + lea r0, [t0+4] + lea r1, [t1+4] +%else + mov r3, r3m + mov r0, r0m + mov r1, r1m + add r3, 8 + add r0, 4 + add r1, 4 +%endif + mov r5d, r8m + jmp .loopx +%endif +%ifdef ARCH_X86_64 ; too many regs for x86_32 .mc1dy: - and r5d, 7 - movd m6, r5d - mov r5, r3 ; pel_offset = dx ? 1 : src_stride + and t2d, 7 + movd m7, t2d + mov r6d, r4d ; pel_offset = dx ? 
2 : src_stride jmp .mc1d .mc1dx: - movd m6, r4d - mov r5d, 1 + movd m7, r5d + mov r6d, 2 .mc1d: - mova m5, [pw_8] - SPLATW m6, m6 - mova m7, [pw_4] - psubw m5, m6 - movifnidn r0, r0mp - movifnidn r1d, r1m - mov r4d, r7m -%if mmsize == 8 - cmp dword r6m, 8 - je .loop1d_w8 + mova m6, [pw_8] + SPLATW m7, m7 + psubw m6, m7 + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + cmp dword r7m, 4 + jg .mc1d_w8 + mov r10, r2 + mov r11, r4 +%if mmsize!=8 + shr r5d, 1 %endif - .loop1d_w4: - movh m0, [r2+r5] - movh m1, [r2] - punpcklbw m0, m3 - punpcklbw m1, m3 - pmullw m0, m6 - pmullw m1, m5 - paddw m0, m7 - paddw m0, m1 - psrlw m0, 3 - packuswb m0, m3 - movh [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w4 -.finish: - REP_RET - -%if mmsize == 8 -.loop1d_w8: - movu m0, [r2+r5] - mova m1, [r2] + movq m0, [r3] + movq m1, [r3+r6] +%if mmsize!=8 + add r3, r11 + movhps m0, [r3] + movhps m1, [r3+r6] +%endif mova m2, m0 - mova m4, m1 - punpcklbw m0, m3 - punpcklbw m1, m3 - punpckhbw m2, m3 - punpckhbw m4, m3 + mova m3, m1 + pand m0, [pw_ff] + pand m1, [pw_ff] + psrlw m2, 8 + psrlw m3, 8 pmullw m0, m6 - pmullw m1, m5 + pmullw m1, m7 pmullw m2, m6 - pmullw m4, m5 - paddw m0, m7 - paddw m2, m7 + pmullw m3, m7 + paddw m0, [pw_4] + paddw m2, [pw_4] paddw m0, m1 - paddw m2, m4 + paddw m2, m3 psrlw m0, 3 psrlw m2, 3 packuswb m0, m2 - mova [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w8 +%if mmsize==8 + xchg r4, r11 + xchg r2, r10 + movd [r0], m0 + psrlq m0, 32 + movd [r1], m0 +%else + movhlps m1, m0 + movd [r0], m0 + movd [r1], m1 + add r0, r10 + add r1, r10 + psrldq m0, 4 + psrldq m1, 4 + movd [r0], m0 + movd [r1], m1 +%endif + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop1d_w4 REP_RET -%endif ; mmsize +.mc1d_w8: + sub r2, 4 + sub r4, 8 + mov r10, 4 + mov r11, 8 +%if mmsize==8 + shl r5d, 1 +%endif + jmp .loop1d_w4 +%endif ; ARCH_X86_64 %endmacro ; MC_CHROMA -INIT_MMX -MC_CHROMA mmxext -INIT_XMM -MC_CHROMA sse2, 8 
-%macro MC_CHROMA_SSSE3 2 -INIT_MMX -cglobal x264_mc_chroma_ssse3%1, 0,6,%2 +%macro MC_CHROMA_SSSE3 0-1 +INIT_XMM +cglobal x264_mc_chroma_ssse3%1, 0,6,9 MC_CHROMA_START - and r4d, 7 and r5d, 7 - mov t0d, r4d + and t2d, 7 + mov t0d, r5d shl t0d, 8 - sub t0d, r4d - mov r4d, 8 + sub t0d, r5d + mov r5d, 8 add t0d, 8 - sub r4d, r5d - imul r5d, t0d ; (x*255+8)*y - imul r4d, t0d ; (x*255+8)*(8-y) - cmp dword r6m, 4 - jg .width8 - mova m5, [pw_32] - movd m6, r5d - movd m7, r4d - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 - mov r5, r2 - and r2, ~3 - and r5, 3 + sub r5d, t2d + imul t2d, t0d ; (x*255+8)*y + imul r5d, t0d ; (x*255+8)*(8-y) + movd m6, t2d + movd m7, r5d +%ifidn %1, _cache64 + mov t0d, r3d + and t0d, 7 %ifdef PIC - lea r11, [ch_shuffle] - movu m5, [r11 + r5*2] + lea t1, [ch_shuffle_adj] + movddup m5, [t1 + t0*4] +%else + movddup m5, [ch_shuffle_adj + t0*4] +%endif + paddb m5, [ch_shuffle] + and r3, ~7 %else - movu m5, [ch_shuffle + r5*2] + mova m5, [ch_shuffle] %endif - movu m0, [r2] + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + SPLATW m6, m6 + SPLATW m7, m7 + cmp dword r7m, 4 + jg .width8 + movu m0, [r3] pshufb m0, m5 .loop4: - movu m1, [r2+r3] + movu m1, [r3+r4] pshufb m1, m5 - movu m3, [r2+2*r3] + movu m3, [r3+r4*2] pshufb m3, m5 - lea r2, [r2+2*r3] mova m2, m1 mova m4, m3 pmaddubsw m0, m7 @@ -1203,109 +1342,90 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2 mova m0, m4 psrlw m1, 6 psrlw m3, 6 - packuswb m1, m1 - packuswb m3, m3 - movh [r0], m1 - movh [r0+r1], m3 - sub r4d, 2 - lea r0, [r0+2*r1] + packuswb m1, m3 + movhlps m3, m1 + movd [r0], m1 + movd [r0+r2], m3 + psrldq m1, 4 + psrldq m3, 4 + movd [r1], m1 + movd [r1+r2], m3 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 jg .loop4 REP_RET -INIT_XMM .width8: - movd m6, r5d - movd m7, r4d - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 -%ifidn %1, 
_cache64 - mov r5, r2 - and r5, 0x3f - cmp r5, 0x38 - jge .split -%endif - mova m5, [pw_32] - movh m0, [r2] - movh m1, [r2+1] - punpcklbw m0, m1 -.loop8: - movh m1, [r2+1*r3] - movh m2, [r2+1*r3+1] - movh m3, [r2+2*r3] - movh m4, [r2+2*r3+1] - punpcklbw m1, m2 - punpcklbw m3, m4 - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 - pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 - paddw m1, m0 - paddw m3, m2 - mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 - packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, [r0+2*r1] - jg .loop8 - REP_RET -%ifidn %1, _cache64 -.split: - and r2, ~7 - and r5, 7 -%ifdef PIC - lea r11, [ch_shuffle] - movu m5, [r11 + r5*2] -%else - movu m5, [ch_shuffle + r5*2] -%endif - movu m0, [r2] + movu m0, [r3] pshufb m0, m5 + movu m1, [r3+8] + pshufb m1, m5 %ifdef ARCH_X86_64 - mova m8, [pw_32] - %define round m8 + SWAP m8, m6 + %define mult1 m8 %else - %define round [pw_32] + mova r0m, m6 + %define mult1 r0m %endif -.splitloop8: - movu m1, [r2+r3] - pshufb m1, m5 - movu m3, [r2+2*r3] +.loop8: + movu m2, [r3+r4] + pshufb m2, m5 + movu m3, [r3+r4+8] pshufb m3, m5 - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 + mova m4, m2 + mova m6, m3 pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, round - paddw m2, round - paddw m1, m0 - paddw m3, m2 - mova m0, m4 + pmaddubsw m1, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m0, [pw_32] + paddw m1, [pw_32] + paddw m0, m2 + paddw m1, m3 + psrlw m0, 6 psrlw m1, 6 + packuswb m0, m1 + pshufd m0, m0, 0xd8 + movq [r0], m0 + movhps [r1], m0 + + movu m2, [r3+r4*2] + pshufb m2, m5 + movu m3, [r3+r4*2+8] + pshufb m3, m5 + mova m0, m2 + mova m1, m3 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m4, [pw_32] + paddw m6, [pw_32] + paddw m2, m4 + paddw m3, m6 + psrlw m2, 6 psrlw m3, 6 - packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, 
[r0+2*r1] - jg .splitloop8 + packuswb m2, m3 + pshufd m2, m2, 0xd8 + movq [r0+r2], m2 + movhps [r1+r2], m2 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop8 REP_RET -%endif -; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size %endmacro -MC_CHROMA_SSSE3 , 8 -MC_CHROMA_SSSE3 _cache64, 9 +INIT_MMX +%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM +MC_CHROMA mmxext +INIT_XMM +MC_CHROMA sse2_misalign +%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD +MC_CHROMA sse2 +MC_CHROMA_SSSE3 +MC_CHROMA_SSSE3 _cache64 + diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 20ef5d7..1dfad35 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -32,10 +32,12 @@ filt_mul20: times 16 db 20 filt_mul15: times 8 db 1, -5 filt_mul51: times 8 db -5, 1 hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 pw_1: times 8 dw 1 pw_16: times 8 dw 16 pw_32: times 8 dw 32 +pw_00ff: times 8 dw 0xff pd_128: times 4 dd 128 SECTION .text @@ -170,7 +172,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2 mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 FILT_PACK m1, m4, 5, m7 - movnt [r0+r4], m1 + movnta [r0+r4], m1 add r1, mmsize add r5, mmsize add r4, mmsize @@ -692,6 +694,206 @@ cglobal x264_plane_copy_core_mmxext, 6,7 RET +%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint + movq m0, [%2] +%if mmsize==16 +%if %4 + punpcklbw m0, [%3] +%else + movq m1, [%3] + punpcklbw m0, m1 +%endif + mov%5a [%1], m0 +%else + movq m1, [%3] + mova m2, m0 + punpcklbw m0, m1 + punpckhbw m2, m1 + mov%5a [%1], m0 + mov%5a [%1+8], m2 +%endif +%endmacro + +%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant +%if mmsize==16 +%ifidn %5, ssse3 + mova m0, [%3] + pshufb m0, %6 +%else + mova m0, [%3] + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 + packuswb m0, m1 +%endif +%if %4 + mova [%1], m0 +%else + movq [%1], m0 + movhps [%2], 
m0 +%endif +%else + mova m0, [%3] + mova m1, [%3+8] + mova m2, m0 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endif +%endmacro + +%macro PLANE_INTERLEAVE 1 +;----------------------------------------------------------------------------- +; void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst, +; uint8_t *srcu, int i_srcu, +; uint8_t *srcv, int i_srcv, int w, int h ) +;----------------------------------------------------------------------------- +; assumes i_dst and w are multiples of 16, and i_dst>2*w +cglobal x264_plane_copy_interleave_core_%1, 6,7 + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + lea r0, [r0+r6*2] + add r2, r6 + add r4, r6 +%ifdef ARCH_X86_64 + DECLARE_REG_TMP 10,11 +%else + DECLARE_REG_TMP 1,3 +%endif + mov t0d, r7m + mov t1d, r1d + shr t1d, 1 + sub t1d, r6d +.loopy: + mov r6d, r6m + neg r6 +.prefetch: + prefetchnta [r2+r6] + prefetchnta [r4+r6] + add r6, 64 + jl .prefetch + mov r6d, r6m + neg r6 +.loopx: + INTERLEAVE r0+r6*2, r2+r6, r4+r6, 0, nt + INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt + add r6, 16 + jl .loopx +.pad: +%if mmsize==8 + movntq [r0+r6*2], m0 + movntq [r0+r6*2+8], m0 + movntq [r0+r6*2+16], m0 + movntq [r0+r6*2+24], m0 +%else + movntdq [r0+r6*2], m0 + movntdq [r0+r6*2+16], m0 +%endif + add r6, 16 + cmp r6, t1 + jl .pad + add r0, r1mp + add r2, r3mp + add r4, r5 + dec t0d + jg .loopy + sfence + emms + RET + +;----------------------------------------------------------------------------- +; void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ) +;----------------------------------------------------------------------------- +cglobal x264_store_interleave_8x8x2_%1, 4,5 + mov r4d, 4 +.loop: + INTERLEAVE r0, r2, r3, 1 + INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1 + add r2, FDEC_STRIDE*2 + add r3, FDEC_STRIDE*2 + lea r0, 
[r0+r1*2] + dec r4d + jg .loop + REP_RET +%endmacro ; PLANE_INTERLEAVE + +%macro PLANE_DEINTERLEAVE 1 +;----------------------------------------------------------------------------- +; void x264_plane_copy_deinterleave_mmx( uint8_t *dstu, int i_dstu, +; uint8_t *dstv, int i_dstv, +; uint8_t *src, int i_src, int w, int h ) +;----------------------------------------------------------------------------- +cglobal x264_plane_copy_deinterleave_%1, 6,7 +%ifidn %1, ssse3 + mova m2, [deinterleave_shuf] +%endif + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + add r0, r6 + add r2, r6 + lea r4, [r4+r6*2] +.loopy: + mov r6d, r6m + neg r6 +.loopx: + DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, %1, m2 + DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m2 + add r6, 16 + jl .loopx + add r0, r1 + add r2, r3 + add r4, r5 + dec dword r7m + jg .loopy + REP_RET + +;----------------------------------------------------------------------------- +; void x264_load_deinterleave_8x8x2_mmx( uint8_t *dst, uint8_t *src, int i_src ) +;----------------------------------------------------------------------------- +cglobal x264_load_deinterleave_8x8x2_%1, 3,4 +%ifidn %1, ssse3 + mova m2, [deinterleave_shuf] +%endif + mov r3d, 4 +.loop: + DEINTERLEAVE r0, r0+8, r1, 1, %1, m2 + DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE+8, r1+r2, 1, %1, m2 + add r0, FENC_STRIDE*2 + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void x264_load_deinterleave_9x1x2_mmx( uint8_t *dstu, uint8_t *dstv, uint8_t *src ) +;----------------------------------------------------------------------------- +cglobal x264_load_deinterleave_9x1x2_%1, 3,3 + DEINTERLEAVE r0+1, r1+1, r2+2, 0, %1, [deinterleave_shuf] + movzx r2d, word [r2] + mov [r0], r2b + shr r2d, 8 + mov [r1], r2b + RET +%endmacro ; PLANE_DEINTERLEAVE + +INIT_MMX +PLANE_INTERLEAVE mmxext +PLANE_DEINTERLEAVE mmx +INIT_XMM +PLANE_INTERLEAVE sse2 
+PLANE_DEINTERLEAVE sse2 +PLANE_DEINTERLEAVE ssse3 + ; These functions are not general-use; not only do the SSE ones require aligned input, ; but they also will fail if given a non-mod16 size or a size less than 64. diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index fd04392..c382320 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -76,20 +76,34 @@ extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int ); extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_prefetch_ref_mmxext( uint8_t *, int, int ); -extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); -extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h); +extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h ); +extern void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +extern void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +extern void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +extern void x264_plane_copy_deinterleave_mmx( 
uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +extern void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +extern void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +extern void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +extern void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +extern void x264_load_deinterleave_8x8x2_mmx( uint8_t *dst, uint8_t *src, int i_src ); +extern void x264_load_deinterleave_8x8x2_sse2( uint8_t *dst, uint8_t *src, int i_src ); +extern void x264_load_deinterleave_8x8x2_ssse3( uint8_t *dst, uint8_t *src, int i_src ); +extern void x264_load_deinterleave_9x1x2_mmx( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); +extern void x264_load_deinterleave_9x1x2_sse2( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); +extern void x264_load_deinterleave_9x1x2_ssse3( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); extern void x264_memzero_aligned_mmx( void * dst, int n ); @@ -103,6 +117,17 @@ extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, int len ); + +#define MC_CHROMA(cpu) \ +extern void x264_mc_chroma_##cpu( uint8_t *dstu, uint8_t *dstv, int i_dst,\ + uint8_t *src, int i_src,\ + int dx, int dy, int i_width, int i_height ); +MC_CHROMA(mmxext) +MC_CHROMA(sse2) +MC_CHROMA(sse2_misalign) +MC_CHROMA(ssse3) +MC_CHROMA(ssse3_cache64) + 
#define LOWRES(cpu) \ extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\ int src_stride, int dst_stride, int width, int height ); @@ -343,11 +368,13 @@ HPEL(16, ssse3, ssse3, ssse3, ssse3) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) -static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h) +static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h ) { if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if(i_src > 0) { + } else if( !(w&15) ) { + x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h ); + } else if( i_src > 0 ) { // have to use plain memcpy on the last line (in memory order) to avoid overreading src x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 ); memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w ); @@ -357,6 +384,27 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i } } +#define PLANE_INTERLEAVE(cpu) \ +static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\ + uint8_t *srcu, int i_srcu,\ + uint8_t *srcv, int i_srcv, int w, int h )\ +{\ + if( !(w&15) ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( i_srcu > 0 ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ + x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ + } else {\ + x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ + }\ +} + 
+PLANE_INTERLEAVE(mmxext) +PLANE_INTERLEAVE(sse2) + void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) @@ -392,7 +440,14 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext; + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext; + pf->load_deinterleave_8x8x2 = x264_load_deinterleave_8x8x2_mmx; + pf->load_deinterleave_9x1x2 = x264_load_deinterleave_9x1x2_mmx; + pf->plane_copy = x264_plane_copy_mmxext; + pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx; + pf->hpel_filter = x264_hpel_filter_mmxext; pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext; @@ -441,10 +496,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_SSE_MISALIGN ) pf->hpel_filter = x264_hpel_filter_sse2_misalign; pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - pf->mc_chroma = x264_mc_chroma_sse2; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium? 
+ pf->load_deinterleave_8x8x2 = x264_load_deinterleave_8x8x2_sse2; + pf->load_deinterleave_9x1x2 = x264_load_deinterleave_9x1x2_sse2; + pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->mc_luma = mc_luma_sse2; pf->get_ref = get_ref_sse2; if( cpu&X264_CPU_CACHELINE_64 ) @@ -453,7 +514,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->get_ref = get_ref_cache64_sse2; } if( cpu&X264_CPU_SSE_MISALIGN ) + { pf->get_ref = get_ref_sse2_misalign; + pf->mc_chroma = x264_mc_chroma_sse2_misalign; + } } if( !(cpu&X264_CPU_SSSE3) ) @@ -468,12 +532,18 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->load_deinterleave_8x8x2 = x264_load_deinterleave_8x8x2_ssse3; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; - pf->mc_chroma = x264_mc_chroma_ssse3; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_ssse3; + if( cpu&X264_CPU_CACHELINE_64 ) { - pf->mc_chroma = x264_mc_chroma_ssse3_cache64; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_ssse3_cache64; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; @@ -483,7 +553,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) } if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + { + pf->load_deinterleave_9x1x2 = x264_load_deinterleave_9x1x2_ssse3; pf->integral_init4v = x264_integral_init4v_ssse3; + } if( !(cpu&X264_CPU_SSE4) ) return; diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index fa9d37a..2975300 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -43,6 +43,7 @@ hmul_8p: times 8 db 1 times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 
11, 13, 15 SECTION .text @@ -302,6 +303,55 @@ SSD 4, 4, ssse3 SSD 4, 8, ssse3 %assign function_align 16 +;----------------------------------------------------------------------------- +; uint64_t x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height ) +;----------------------------------------------------------------------------- +%macro SSD_NV12 1-2 0 +cglobal x264_pixel_ssd_nv12_core_%1, 6,7 + shl r4d, 1 + add r0, r4 + add r2, r4 + pxor m3, m3 + pxor m4, m4 + mova m5, [pw_00ff] +.loopy: + mov r6, r4 + neg r6 +.loopx: + mova m0, [r0+r6] + mova m1, [r2+r6] + psubusb m0, m1 + psubusb m1, [r0+r6] + por m0, m1 + mova m2, m0 + pand m0, m5 + psrlw m2, 8 + pmaddwd m0, m0 + pmaddwd m2, m2 + paddd m3, m0 + paddd m4, m2 + add r6, mmsize + jl .loopx + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + HADDD m3, m0 + HADDD m4, m0 + movd eax, m3 + movd edx, m4 +%ifdef ARCH_X86_64 + shl rdx, 32 + add rax, rdx +%endif + RET +%endmacro ; SSD_NV12 + +INIT_MMX +SSD_NV12 mmxext +INIT_XMM +SSD_NV12 sse2 + ;============================================================================= ; variance ;============================================================================= diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 9bba683..0339564 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -97,6 +97,10 @@ void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * ); +uint64_t x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); +uint64_t x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); void 
x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index ee3eca9..6d98ae3 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -269,13 +269,21 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] ASSERT %2 >= %1 %assign regs_used %2 ASSERT regs_used <= 7 - %assign xmm_regs_used %3 - ASSERT xmm_regs_used <= 16 %if regs_used > 4 push r4 push r5 %assign stack_offset stack_offset+16 %endif + SPILL_XMM %3 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + DEFINE_ARGS %4 +%endmacro + +%macro SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 sub rsp, (xmm_regs_used-6)*16+16 %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 @@ -285,10 +293,6 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i %endrep %endif - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 - DEFINE_ARGS %4 %endmacro %macro RESTORE_XMM_INTERNAL 1 @@ -480,7 +484,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define mova movq %define movu movq %define movh movd - %define movnt movntq + %define movnta movntq %assign %%i 0 %rep 8 CAT_XDEFINE m, %%i, mm %+ %%i @@ -504,7 +508,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define mova movdqa %define movu movdqu %define movh movq - %define movnt movntdq + %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, xmm %+ %%i diff --git a/encoder/analyse.c b/encoder/analyse.c index 2ece9dc..1a72fba 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1092,8 +1092,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \ - (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ - 
(m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = weight_none; \ (m)->i_ref = ref; @@ -1474,11 +1473,11 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; + // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ - h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[1].weightfn ) \ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ - h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[2].weightfn ) \ weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); diff --git a/encoder/cabac.c b/encoder/cabac.c index a0dcff2..427dba7 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -76,9 +76,9 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb ) if( h->sh.i_type == SLICE_TYPE_I ) { int ctx = 0; - if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left != I_4x4 ) + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 ) ctx++; - if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != I_4x4 ) + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 ) ctx++; x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 ); @@ -110,9 +110,9 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb ) else //if( h->sh.i_type == SLICE_TYPE_B 
) { int ctx = 0; - if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT ) + if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT ) ctx++; - if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT ) + if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT ) ctx++; if( i_mb_type == B_DIRECT ) @@ -277,8 +277,8 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb ) #if !RDO_SKIP_BS void x264_cabac_mb_skip( x264_t *h, int b_skip ) { - int ctx = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left )) - + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top )) + int ctx = (h->mb.i_mb_type_left >= 0 && !IS_SKIP( h->mb.i_mb_type_left )) + + (h->mb.i_mb_type_top >= 0 && !IS_SKIP( h->mb.i_mb_type_top )) + (h->sh.i_type == SLICE_TYPE_P ? 11 : 24); x264_cabac_encode_decision( &h->cabac, ctx, b_skip ); } diff --git a/encoder/encoder.c b/encoder/encoder.c index 300041e..ef1453d 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -66,9 +66,15 @@ static void x264_frame_dump( x264_t *h ) return; /* Write the frame in display order */ fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET ); - for( int i = 0; i < h->fdec->i_plane; i++ ) - for( int y = 0; y < h->param.i_height >> !!i; y++ ) - fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f ); + for( int y = 0; y < h->param.i_height; y++ ) + fwrite( &h->fdec->plane[0][y*h->fdec->i_stride[0]], 1, h->param.i_width, f ); + int cw = h->param.i_width>>1; + int ch = h->param.i_height>>1; + uint8_t *planeu = x264_malloc( cw*ch*2 ); + uint8_t *planev = planeu + cw*ch; + h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); + fwrite( planeu, 1, cw*ch*2, f ); + x264_free( planeu ); fclose( f ); } @@ 
-467,7 +473,7 @@ static int x264_validate_parameters( x264_t *h ) h->param.analyse.i_trellis = 0; h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; - h->param.analyse.b_psy = 0; + h->param.analyse.f_psy_rd = 0; h->param.i_bframe = 0; /* 8x8dct is not useful at all in CAVLC lossless */ if( !h->param.b_cabac ) @@ -1568,11 +1574,11 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y ) if( !b_end && !h->param.b_sliced_threads ) for( int j = 0; j <= h->sh.b_mbaff; j++ ) - for( int i = 0; i < 3; i++ ) + for( int i = 0; i < 2; i++ ) { memcpy( h->mb.intra_border_backup[j][i], - h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i], - h->sps->i_mb_width*16 >> !!i ); + h->fdec->plane[i] + ((mb_y*16 >> i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i], + h->sps->i_mb_width*16 ); } if( b_deblock ) @@ -1596,12 +1602,19 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y ) max_y = b_end ? h->param.i_height : mb_y*16-8; if( h->param.analyse.b_psnr ) - for( int i = 0; i < 3; i++ ) - h->stat.frame.i_ssd[i] += - x264_pixel_ssd_wxh( &h->pixf, - h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i], - h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i], - h->param.i_width >> !!i, (max_y-min_y) >> !!i ); + { + uint64_t ssd_y = x264_pixel_ssd_wxh( &h->pixf, + h->fdec->plane[0] + min_y * h->fdec->i_stride[0], h->fdec->i_stride[0], + h->fenc->plane[0] + min_y * h->fenc->i_stride[0], h->fenc->i_stride[0], + h->param.i_width, max_y-min_y ); + uint64_t ssd_uv = x264_pixel_ssd_nv12( &h->pixf, + h->fdec->plane[1] + (min_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1], + h->fenc->plane[1] + (min_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1], + h->param.i_width>>1, (max_y-min_y)>>1 ); + h->stat.frame.i_ssd[0] += ssd_y; + h->stat.frame.i_ssd[1] += (uint32_t)ssd_uv; + h->stat.frame.i_ssd[2] += ssd_uv>>32; + } if( h->param.analyse.b_ssim ) { @@ -2565,7 +2578,7 @@ static int 
x264_encoder_frame_end( x264_t *h, x264_t *thread_current, x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" ); pic_out->img.i_plane = h->fdec->i_plane; - for( int i = 0; i < 3; i++ ) + for( int i = 0; i < h->fdec->i_plane; i++ ) { pic_out->img.i_stride[i] = h->fdec->i_stride[i]; pic_out->img.plane[i] = h->fdec->plane[i]; diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 50f939a..5a42761 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -486,7 +486,7 @@ static void x264_macroblock_encode_pskip( x264_t *h ) h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], mvx, mvy, 16, 16, &h->sh.weight[0][0] ); - h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE, + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], mvx, mvy, 8, 8 ); @@ -494,11 +494,6 @@ static void x264_macroblock_encode_pskip( x264_t *h ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &h->sh.weight[0][1], 8 ); - - h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], - mvx, mvy, 8, 8 ); - if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, @@ -945,22 +940,20 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) i_qp = h->mb.i_chroma_qp; thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + if( !b_bidir ) + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, + h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], + mvp[0], mvp[1], 8, 8 ); + for( int ch = 0; ch < 2; ch++ ) { uint8_t *p_src = h->mb.pic.p_fenc[1+ch]; uint8_t *p_dst = h->mb.pic.p_fdec[1+ch]; - if( !b_bidir ) - { - h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], - mvp[0], mvp[1], 8, 8 ); - - if( h->sh.weight[0][1+ch].weightfn ) - h->sh.weight[0][1+ch].weightfn[8>>2]( 
h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - &h->sh.weight[0][1+ch], 8 ); - } + if( !b_bidir && h->sh.weight[0][1+ch].weightfn ) + h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + &h->sh.weight[0][1+ch], 8 ); /* there is almost never a termination during chroma, but we can't avoid the check entirely */ /* so instead we check SSD and skip the actual check if the score is low enough. */ diff --git a/encoder/me.c b/encoder/me.c index 6788022..06c6d65 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -775,18 +775,17 @@ if( b_refine_qpel || (dir^1) != odir ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ if( b_chroma_me && cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ + h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \ &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \ &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \ @@ -909,10 +908,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite 
stride[list][i] = bw;\ src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \ if( rd )\ - {\ - h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ } #define SATD_THRESH 17/16 @@ -1094,10 +1090,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei uint64_t cost; \ M32( cache_mv ) = pack16to32_mask(mx,my); \ if( m->i_pixel <= PIXEL_8x8 )\ - {\ - h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index b51dbf7..58f1a21 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -205,20 +205,30 @@ static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale ) + rce->misc_bits; } +static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_sqr, int shift ) +{ + uint32_t sum = (uint32_t)sum_sqr; + uint32_t sqr = sum_sqr >> 32; + return sqr - (sum * sum >> shift); +} + static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i ) { int w = i ? 8 : 16; - int shift = i ? 6 : 8; int stride = frame->i_stride[i]; int offset = h->mb.b_interlaced - ? 
w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride - : w * (mb_x + mb_y * stride); - int pix = i ? PIXEL_8x8 : PIXEL_16x16; + ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride + : 16 * mb_x + w * mb_y * stride; stride <<= h->mb.b_interlaced; - uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride ); - uint32_t sum = (uint32_t)res; - uint32_t sqr = res >> 32; - return sqr - (sum * sum >> shift); + if( i ) + { + ALIGNED_ARRAY_16( uint8_t, pix,[FENC_STRIDE*8] ); + h->mc.load_deinterleave_8x8x2( pix, frame->plane[1] + offset, stride ); + return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6 ) + + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6 ); + } + else + return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8 ); } // Find the total AC energy of the block in all planes. @@ -230,7 +240,6 @@ static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame * sure no reordering goes on. */ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 ); var += ac_energy_plane( h, mb_x, mb_y, frame, 1 ); - var += ac_energy_plane( h, mb_x, mb_y, frame, 2 ); x264_emms(); return var; } diff --git a/input/avs.c b/input/avs.c index 9e3aa55..59634f4 100644 --- a/input/avs.c +++ b/input/avs.c @@ -233,8 +233,6 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c return -1; } res = update_clip( h, &vi, tmp, res ); - info->interlaced = 1; - info->tff = avs_is_tff( vi ); } if( vi->width&1 || vi->height&1 ) { @@ -242,6 +240,14 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c vi->width, vi->height ); return -1; } + /* bff/tff flags in avisynth are not technically mutually exclusive, which can lead to both being set. + * avisynth's own functions enact mutual exclusion, but source filters are not guaranteed to do this. 
*/ + int tff = avs_is_tff( vi ); + if( avs_is_bff( vi ) ^ tff ) + { + info->interlaced = 1; + info->tff = !!tff; + } /* always call ConvertToYV12 to convert non YV12 planar colorspaces to YV12 when user's AVS supports them, as all planar colorspaces are flagged as YV12. If it is already YV12 in this case, the call does nothing */ if( !avs_is_yv12( vi ) || avs_version >= AVS_INTERFACE_OTHER_PLANAR ) diff --git a/tools/checkasm.c b/tools/checkasm.c index 228b75f..6559b31 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -445,6 +445,25 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 ); report( "intra sad_x3 :" ); + ok = 1; used_asm = 0; + if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core ) + { + used_asm = 1; + set_func_name( "ssd_nv12" ); + uint64_t res_c = pixel_c.ssd_nv12_core( buf1, 368, buf2, 368, 360, 8 ); + uint64_t res_a = pixel_asm.ssd_nv12_core( buf1, 368, buf2, 368, 360, 8 ); + if( res_c != res_a ) + { + ok = 0; + fprintf( stderr, "ssd_nv12: %u,%u != %u,%u\n", + (uint32_t)res_c, (uint32_t)(res_c>>32), + (uint32_t)res_a, (uint32_t)(res_a>>32) ); + } + call_c( pixel_c.ssd_nv12_core, buf1, 368, buf2, 368, 360, 8 ); + call_a( pixel_asm.ssd_nv12_core, buf1, 368, buf2, 368, 360, 8 ); + } + report( "ssd_nv12 :" ); + if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) { @@ -823,12 +842,15 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ memset( buf3, 0xCD, 1024 ); \ memset( buf4, 0xCD, 1024 ); \ - call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \ - call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \ + call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \ + call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. 
*/\ for( int j = 0; j < h; j++ ) \ - for( int i = w; i < 4; i++ ) \ + for( int i = w; i < 8; i++ ) \ + { \ + dst2[i+j*16+8] = dst1[i+j*16+8]; \ dst2[i+j*16] = dst1[i+j*16]; \ + } \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ @@ -970,6 +992,123 @@ static int check_mc( int cpu_ref, int cpu_new ) } report( "mc offsetsub :" ); + ok = 1; used_asm = 0; + if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 ) + { + set_func_name( "store_interleave_8x8x2" ); + used_asm = 1; + memset( buf3, 0, 64*8 ); + memset( buf4, 0, 64*8 ); + call_c( mc_c.store_interleave_8x8x2, buf3, 64, buf1, buf1+16 ); + call_a( mc_a.store_interleave_8x8x2, buf4, 64, buf1, buf1+16 ); + if( memcmp( buf3, buf4, 64*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_8x8x2 != mc_ref.load_deinterleave_8x8x2 ) + { + set_func_name( "load_deinterleave_8x8x2" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_8x8x2, buf3, buf1, 64 ); + call_a( mc_a.load_deinterleave_8x8x2, buf4, buf1, 64 ); + if( memcmp( buf3, buf4, FENC_STRIDE*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_9x1x2 != mc_ref.load_deinterleave_9x1x2 ) + { + set_func_name( "load_deinterleave_9x1x2" ); + used_asm = 1; + memset( buf3, 0, 32 ); + memset( buf4, 0, 32 ); + call_c( mc_c.load_deinterleave_9x1x2, buf3+7, buf3+16+7, buf1+14 ); + call_a( mc_a.load_deinterleave_9x1x2, buf4+7, buf4+16+7, buf1+14 ); + if( memcmp( buf3, buf4, 32 ) ) + ok = 0; + } + report( "store_interleave :" ); + + struct plane_spec { + int w, h, src_stride; + } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} }; + ok = 1; used_asm = 0; + if( mc_a.plane_copy != mc_ref.plane_copy ) + { + set_func_name( "plane_copy" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = plane_specs[i].w; + int h = plane_specs[i].h; + int src_stride = 
plane_specs[i].src_stride; + int dst_stride = (w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy, buf3, dst_stride, src, src_stride, w, h ); + call_a( mc_a.plane_copy, buf4, dst_stride, src, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) + { + set_func_name( "plane_copy_interleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int src_stride = (plane_specs[i].src_stride + 1) >> 1; + int dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_interleave, buf3, dst_stride, src, src_stride, src+1024, src_stride+16, w, h ); + call_a( mc_a.plane_copy_interleave, buf4, dst_stride, src, src_stride, src+1024, src_stride+16, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, 2*w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + + if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave ) + { + set_func_name( "plane_copy_deinterleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int dst_stride = w; + int src_stride = (2*w + 127) & ~63; + int offv = (dst_stride*h + 31) & ~15; + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_deinterleave, 
buf3, dst_stride, buf3+offv, dst_stride, buf1, src_stride, w, h ); + call_a( mc_a.plane_copy_deinterleave, buf4, dst_stride, buf4+offv, dst_stride, buf1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, w ) || + memcmp( buf3+y*dst_stride+offv, buf4+y*dst_stride+offv, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + report( "plane_copy :" ); + if( mc_a.hpel_filter != mc_ref.hpel_filter ) { uint8_t *srchpel = buf1+8+2*64; @@ -1101,13 +1240,12 @@ static int check_deblock( int cpu_ref, int cpu_new ) x264_deblock_init( cpu_new, &db_a ); /* not exactly the real values of a,b,tc but close enough */ for( int i = 35, a = 255, c = 250; i >= 0; i-- ) { alphas[i] = a; betas[i] = (i+1)/2; - tcs[i][0] = tcs[i][3] = (c+6)/10; - tcs[i][1] = (c+7)/15; - tcs[i][2] = (c+9)/20; + tcs[i][0] = tcs[i][2] = (c+6)/10; + tcs[i][1] = tcs[i][3] = (c+9)/20; a = a*9/10; c = c*9/10; } diff --git a/x264.c b/x264.c index 3f46fd9..add23b3 100644 --- a/x264.c +++ b/x264.c @@ -356,9 +356,8 @@ static void Help( x264_param_t *defaults, int longhelp ) " stillimage,psnr,ssim\n" " - other tunings: fastdecode,zerolatency\n" ); H2( " --slow-firstpass Don't force these faster settings with --pass 1:\n" - " --no-8x8dct --me dia --partitions none\n" - " --ref 1 --subme {2 if >2 else unchanged}\n" - " --trellis 0 --fast-pskip\n" ); + " --no-8x8dct --me dia --partitions none --ref 1\n" + " --subme {2 if >2 else unchanged} --trellis 0\n" ); else H1( " --slow-firstpass Don't force faster settings with --pass 1\n" ); H0( "\n" ); H0( "Frame-type options:\n" );