diff --git a/common/frame.c b/common/frame.c index abcfd14..5ce29bd 100644 --- a/common/frame.c +++ b/common/frame.c @@ -44,21 +44,14 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) i_stride = ALIGN( i_width + 2*PADH, align ); i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced ); - frame->i_plane = 3; - for( int i = 0; i < 3; i++ ) + frame->i_plane = 2; + for( int i = 0; i < 2; i++ ) { - frame->i_stride[i] = ALIGN( i_stride >> !!i, align ); + frame->i_stride[i] = ALIGN( i_stride, align ); frame->i_width[i] = i_width >> !!i; frame->i_lines[i] = i_lines >> !!i; } - - luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); - chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv)); - for( int i = 1; i < 3; i++ ) - { - CHECKED_MALLOC( frame->buffer[i], chroma_plane_size ); - frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; - } + frame->i_stride[2] = frame->i_stride[1]; // FIXME remove for( int i = 0; i < h->param.i_bframe + 2; i++ ) for( int j = 0; j < h->param.i_bframe + 2; j++ ) @@ -83,6 +76,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) frame->orig = frame; + luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); + chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv)); // FIXME 1*i_padv ? + + CHECKED_MALLOC( frame->buffer[1], chroma_plane_size ); + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH; + /* all 4 luma planes allocated together, since the cacheline split code * requires them to be in-phase wrt cacheline alignment. */ if( h->param.analyse.i_subpel_refine && b_fdec ) @@ -235,34 +234,49 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) dst->param = src->param; dst->i_pic_struct = src->i_pic_struct; + uint8_t *plane[3]; + int stride[3]; for( int i = 0; i < 3; i++ ) { int s = (i_csp == X264_CSP_YV12 && i) ? 
i^3 : i; - uint8_t *plane = src->img.plane[s]; - int stride = src->img.i_stride[s]; - int width = h->param.i_width >> !!i; int height = h->param.i_height >> !!i; + plane[i] = src->img.plane[s]; + stride[i] = src->img.i_stride[s]; if( src->img.i_csp & X264_CSP_VFLIP ) { - plane += (height-1)*stride; - stride = -stride; + plane[i] += (height-1)*stride[i]; + stride[i] = -stride[i]; } - h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height ); } + + h->mc.plane_copy( dst->plane[0], dst->i_stride[0], plane[0], stride[0], + h->param.i_width, h->param.i_height ); + h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], plane[1], stride[1], plane[2], stride[2], + h->param.i_width>>1, h->param.i_height>>1 ); return 0; } -static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom ) +static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma ) { #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride ) for( int y = 0; y < i_height; y++ ) { - /* left band */ - memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh ); - /* right band */ - memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh ); + if( b_chroma ) + { + for( int x = 0; x < i_padh; x+=2 ) + CP16( PPIXEL(-x, y), PPIXEL(0, y) ); + for( int x = 0; x < i_padh; x+=2 ) + CP16( PPIXEL(i_width+x, y), PPIXEL(i_width-2, y) ); + } + else + { + /* left band */ + memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh ); + /* right band */ + memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh ); + } } /* upper band */ if( b_pad_top ) @@ -283,9 +297,9 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e for( int i = 0; i < frame->i_plane; i++ ) { int stride = frame->i_stride[i]; - int width = 16*h->sps->i_mb_width >> !!i; + int width = 16*h->sps->i_mb_width; int height = (b_end ? 
16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i; - int padh = PADH >> !!i; + int padh = PADH; int padv = PADV >> !!i; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i); @@ -293,12 +307,12 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e height += 4 >> (!!i + h->sh.b_mbaff); if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i ); } else { - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i ); } } } @@ -320,32 +334,32 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4; if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 ); } else - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 ); } } void x264_frame_expand_border_lowres( x264_frame_t *frame ) { for( int i = 0; i < 4; i++ ) - plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 ); + plane_expand_border( frame->lowres[i], 
frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 ); } void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame ) { for( int i = 0; i < frame->i_plane; i++ ) { - int i_subsample = i ? 1 : 0; - int i_width = h->param.i_width >> i_subsample; - int i_height = h->param.i_height >> i_subsample; - int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample; - int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample; + int i_width = h->param.i_width; + int i_height = h->param.i_height >> !!i; + int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width); + int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> !!i; if( i_padx ) { + // FIXME NV12 chroma for( int y = 0; y < i_height; y++ ) memset( &frame->plane[i][y*frame->i_stride[i] + i_width], frame->plane[i][y*frame->i_stride[i] + i_width - 1], @@ -508,7 +522,8 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int pix += 2*ystride; continue; } - for( int d = 0; d < 2; d++ ) + for( int d = 0; d < 2; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -521,17 +536,16 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */ } - pix += ystride; } } } static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 ); + deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); } static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) @@ -586,9 +600,10 @@ static void 
deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int bet deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int dir ) { - for( int d = 0; d < 8; d++ ) + for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) + for( int e = 0; e < (dir?1:2); e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -600,19 +615,18 @@ static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystrid pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } - pix += ystride; } } static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, stride, 1, alpha, beta ); + deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 ); } static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, 1, stride, alpha, beta ); + deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 ); } -static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) +static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp + h->sh.i_alpha_c0_offset; int alpha = alpha_table(index_a); @@ -627,12 +641,10 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_ tc[2] = tc0_table(index_a)[bS[2]] + b_chroma; tc[3] = tc0_table(index_a)[bS[3]] + b_chroma; - pf_inter( pix1, i_stride, alpha, beta, tc ); - if( b_chroma ) - pf_inter( pix2, i_stride, alpha, beta, tc ); + pf_inter( pix, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, 
uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static inline void deblock_edge_intra( x264_t *h, uint8_t *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) { int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset); int beta = beta_table(i_qp + h->sh.i_beta_offset); @@ -640,9 +652,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, if( !alpha || !beta ) return; - pf_intra( pix1, i_stride, alpha, beta ); - if( b_chroma ) - pf_intra( pix2, i_stride, alpha, beta ); + pf_intra( pix, i_stride, alpha, beta ); } void x264_frame_deblock_row( x264_t *h, int mb_y ) @@ -672,13 +682,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int i_qp = h->mb.qp[mb_xy]; int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4; uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; - uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x; - uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x; + uint8_t *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x; if( b_interlaced && (mb_y&1) ) { pixy -= 15*stridey; - pixu -= 7*strideuv; - pixv -= 7*strideuv; + pixuv -= 7*strideuv; } x264_prefetch_fenc( h, h->fdec, mb_x, mb_y ); @@ -693,14 +701,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) if( i_dir == 0 )\ {\ /* vertical edge */\ - deblock_edge##intra( h, pixy + 4*i_edge, NULL,\ + deblock_edge##intra( h, pixy + 4*i_edge,\ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\ h->loopf.deblock_h_luma##intra );\ if( !(i_edge & 1) )\ {\ /* U/V planes */\ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\ - deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\ + deblock_edge##intra( h, pixuv + 4*i_edge,\ stride2uv, bS, i_qpc, 1,\ h->loopf.deblock_h_chroma##intra );\ }\ @@ -708,14 +716,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) else\ {\ /* horizontal edge */\ - deblock_edge##intra( h, pixy + 
4*i_edge*stride2y, NULL,\ + deblock_edge##intra( h, pixy + 4*i_edge*stride2y,\ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\ h->loopf.deblock_v_luma##intra );\ /* U/V planes */\ if( !(i_edge & 1) )\ {\ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\ - deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\ + deblock_edge##intra( h, pixuv + 2*i_edge*stride2uv,\ stride2uv, bS, i_qpc, 1,\ h->loopf.deblock_v_chroma##intra );\ }\ @@ -831,21 +839,25 @@ void x264_frame_deblock( x264_t *h ) } #ifdef HAVE_MMX -void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); - void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); #ifdef ARCH_X86 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void 
x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +// FIXME this wrapper has a significant cpu cost static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 ); @@ -885,22 +897,26 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) #ifdef HAVE_MMX if( cpu&X264_CPU_MMXEXT ) { - pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext; - pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext; - pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; - pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; #ifdef ARCH_X86 pf->deblock_v_luma = x264_deblock_v_luma_mmxext; pf->deblock_h_luma = x264_deblock_h_luma_mmxext; + pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext; + pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; + pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; #endif if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_v_luma = x264_deblock_v_luma_sse2; pf->deblock_h_luma = x264_deblock_h_luma_sse2; + pf->deblock_v_chroma = x264_deblock_v_chroma_sse2; + pf->deblock_h_chroma = x264_deblock_h_chroma_sse2; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_sse2; + pf->deblock_h_chroma_intra = 
x264_deblock_h_chroma_intra_sse2; } } #endif @@ -918,8 +934,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) { pf->deblock_v_luma = x264_deblock_v_luma_neon; pf->deblock_h_luma = x264_deblock_h_luma_neon; - pf->deblock_v_chroma = x264_deblock_v_chroma_neon; - pf->deblock_h_chroma = x264_deblock_h_chroma_neon; +// pf->deblock_v_chroma = x264_deblock_v_chroma_neon; +// pf->deblock_h_chroma = x264_deblock_h_chroma_neon; } #endif } diff --git a/common/macroblock.c b/common/macroblock.c index 6dd19f6..f764349 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -503,7 +503,8 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h if( h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); @@ -511,11 +512,6 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->sh.weight[i_ref][1], height*2 ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); - if( h->sh.weight[i_ref][2].weightfn ) h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, @@ -536,13 +532,10 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h if( h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + 
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); } static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height ) @@ -573,16 +566,12 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int if( h->mb.b_interlaced & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], mvx1, mvy1, 2*width, 2*height ); h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2], - mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2], - mvx1, mvy1, 2*width, 2*height ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); + h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } void x264_mb_mc_8x8( x264_t *h, int i8 ) @@ -756,11 +745,11 @@ int x264_macroblock_cache_init( x264_t *h ) } for( int i = 0; i <= h->param.b_interlaced; i++ ) - for( int j = 0; j < 3; j++ ) + for( int j = 0; j < 2; j++ ) { /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */ - CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j ); - 
h->mb.intra_border_backup[i][j] += 8; + CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], h->sps->i_mb_width*16+32 ); + h->mb.intra_border_backup[i][j] += 16; } return 0; @@ -769,7 +758,7 @@ fail: return -1; void x264_macroblock_cache_end( x264_t *h ) { for( int i = 0; i <= h->param.b_interlaced; i++ ) - for( int j = 0; j < 3; j++ ) + for( int j = 0; j < 2; j++ ) x264_free( h->mb.intra_border_backup[i][j] - 8 ); for( int i = 0; i < 2; i++ ) for( int j = 0; j < 32; j++ ) @@ -895,29 +884,53 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x const int i_stride = h->fdec->i_stride[!!i]; const int i_stride2 = i_stride << h->mb.b_interlaced; const int i_pix_offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride - : w * (mb_x + mb_y * i_stride); + ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + w * mb_y * i_stride; const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; const uint8_t *intra_fdec = h->param.b_sliced_threads ? 
plane_fdec-i_stride2 : - &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i]; + &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16]; int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; x264_frame_t **fref[2] = { h->fref0, h->fref1 }; if( h->mb.b_interlaced ) ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride; h->mb.pic.i_stride[i] = i_stride2; + if(i) + h->mb.pic.i_stride[2] = i_stride2; // FIXME remove h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, - h->mb.pic.p_fenc_plane[i], i_stride2, w ); - if( mb_y > 0 ) - memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); + if( i ) + { + h->mc.load_deinterleave_8x8x2( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 ); + if( mb_y > 0 ) + h->mc.load_deinterleave_9x1x2( &h->mb.pic.p_fdec[1][-1-FDEC_STRIDE], &h->mb.pic.p_fdec[2][-1-FDEC_STRIDE], intra_fdec-2 ); + else + { + memset( &h->mb.pic.p_fdec[1][-1-FDEC_STRIDE], 0, 9 ); + memset( &h->mb.pic.p_fdec[2][-1-FDEC_STRIDE], 0, 9 ); + } + } else - memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 ); + { + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 ); + if( mb_y > 0 ) + memcpy( &h->mb.pic.p_fdec[0][-1-FDEC_STRIDE], intra_fdec-1, 25 ); + else + memset( &h->mb.pic.p_fdec[0][-1-FDEC_STRIDE], 0, 25 ); + } + // FIXME simplify if( h->mb.b_interlaced ) for( int j = 0; j < w; j++ ) - h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + if( i ) + { + h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; + h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + } + else + h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; for( int j = 0; j < h->mb.pic.i_fref[0]; j++ ) { - h->mb.pic.p_fref[0][j][i==0 ? 
0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + h->mb.pic.p_fref[0][j][i?4:0] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if(i) + h->mb.pic.p_fref[0][j][5] = h->mb.pic.p_fref[0][j][4]+1; if( i == 0 ) { for( int k = 1; k < 4; k++ ) @@ -931,7 +944,9 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x if( h->sh.i_type == SLICE_TYPE_B ) for( int j = 0; j < h->mb.pic.i_fref[1]; j++ ) { - h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + h->mb.pic.p_fref[1][j][i?4:0] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if(i) + h->mb.pic.p_fref[1][j][5] = h->mb.pic.p_fref[1][j][4]+1; if( i == 0 ) for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[1][j][k] = &fref[1][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; @@ -1123,7 +1138,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) /* load picture pointers */ x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0 ); x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1 ); - x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2 ); if( h->fdec->integral ) { @@ -1271,10 +1285,12 @@ static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i ) int i_stride = h->fdec->i_stride[!!i]; int i_stride2 = i_stride << h->mb.b_interlaced; int i_pix_offset = h->mb.b_interlaced - ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride - : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride); - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, - h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); + ? 
16 * h->mb.i_mb_x + w * (h->mb.i_mb_y&~1) * i_stride + (h->mb.i_mb_y&1) * i_stride + : 16 * h->mb.i_mb_x + w * h->mb.i_mb_y * i_stride; + if( i ) + h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] ); + else + h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 ); } void x264_macroblock_cache_save( x264_t *h ) @@ -1293,7 +1309,6 @@ void x264_macroblock_cache_save( x264_t *h ) x264_macroblock_store_pic( h, 0 ); x264_macroblock_store_pic( h, 1 ); - x264_macroblock_store_pic( h, 2 ); x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); diff --git a/common/mc.c b/common/mc.c index 859e5fc..e5a1957 100644 --- a/common/mc.c +++ b/common/mc.c @@ -260,7 +260,7 @@ static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride, } /* full chroma mc (ie until 1/8 pixel)*/ -static void mc_chroma( uint8_t *dst, int i_dst_stride, +static void mc_chroma( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, uint8_t *src, int i_src_stride, int mvx, int mvy, int i_width, int i_height ) @@ -274,14 +274,20 @@ static void mc_chroma( uint8_t *dst, int i_dst_stride, int cC = (8-d8x)*d8y; int cD = d8x *d8y; - src += (mvy >> 3) * i_src_stride + (mvx >> 3); + src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; for( int y = 0; y < i_height; y++ ) { for( int x = 0; x < i_width; x++ ) - dst[x] = ( cA*src[x] + cB*src[x+1] + cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6; - dst += i_dst_stride; + { + dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] + + cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6; + dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] + + cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6; + } + dstu += i_dst_stride; + dstv += i_dst_stride; src = srcp; srcp += i_src_stride; } @@ -297,7 +303,7 @@ MC_COPY( 8 ) MC_COPY( 4 ) void x264_plane_copy_c( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int w, int h) + uint8_t *src, int i_src, int w, int h ) { while( h-- ) { @@ -307,6 
+313,45 @@ void x264_plane_copy_c( uint8_t *dst, int i_dst, } } +void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ) +{ + for( int y=0; ycopy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; + pf->store_interleave_8x8x2 = store_interleave_8x8x2; + pf->load_deinterleave_8x8x2 = load_deinterleave_8x8x2; + pf->load_deinterleave_9x1x2 = load_deinterleave_9x1x2; + pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_interleave = x264_plane_copy_interleave_c; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; + pf->hpel_filter = hpel_filter; pf->prefetch_fenc = prefetch_fenc_null; diff --git a/common/mc.h b/common/mc.h index 68bba48..c469b7e 100644 --- a/common/mc.h +++ b/common/mc.h @@ -68,7 +68,7 @@ typedef struct /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. */ - void (*mc_chroma)(uint8_t *dst, int i_dst, uint8_t *src, int i_src, + void (*mc_chroma)(uint8_t *dstu, uint8_t *dstv, int i_dst, uint8_t *src, int i_src, int mvx, int mvy, int i_width, int i_height ); @@ -78,13 +78,24 @@ typedef struct void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height ); void (*copy_16x16_unaligned)( uint8_t *dst, int, uint8_t *src, int, int i_height ); + void (*store_interleave_8x8x2)( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); + void (*load_deinterleave_8x8x2)( uint8_t *dst, uint8_t *src, int i_src ); + void (*load_deinterleave_9x1x2)( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); + void (*plane_copy)( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int w, int h); + uint8_t *src, int i_src, int w, int h ); + void (*plane_copy_interleave)( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); + void (*plane_copy_deinterleave)( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); void (*hpel_filter)( 
uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int i_stride, int i_width, int i_height, int16_t *buf ); /* prefetch the next few macroblocks of fenc or fdec */ + // FIXME NV12 void (*prefetch_fenc)( uint8_t *pix_y, int stride_y, uint8_t *pix_uv, int stride_uv, int mb_x ); /* prefetch the next few macroblocks of a hpel reference frame */ diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 00d0418..90270fe 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -21,6 +21,7 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION_RODATA pb_00: times 16 db 0x00 @@ -35,74 +36,123 @@ SECTION .text [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + mova m1, m0 + mova m3, m2 + punpckl%3 m0, m4 + punpckh%3 m1, m4 + punpckl%3 m2, m6 + punpckh%3 m3, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in 
%1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 +%macro TRANSPOSE8x4B_STORE 8 + mova m4, m0 + mova m5, m1 + mova m6, m2 punpckhdq m4, m4 punpckhdq m5, m5 punpckhdq m6, m6 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 + mova m1, m0 punpcklwd m0, m2 punpckhwd m1, m2 - movd %1, m0 + movh %1, m0 punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + movh %2, m0 + movh %3, m1 punpckhdq m1, m1 - movd %4, m1 + movh %4, m1 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 + mova m5, m4 punpcklwd m4, m6 punpckhwd m5, m6 - movd %5, m4 + movh %5, m4 punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + movh %6, m4 + movh %7, m5 punpckhdq m5, m5 - movd %8, m5 + movh %8, m5 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro + +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==16 + TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 +%else + SWAP 1, 4, 2, 3 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [t5+r1*2] + mova m3, [t5+t6] + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif +%endmacro + +%macro TRANSPOSE8x2W_STORE 8 + mova m0, m1 + punpcklwd m1, m2 + punpckhwd m0, m2 +%if mmsize==8 + movd %1, m1 + movd %3, m0 + psrlq m1, 32 + psrlq m0, 32 + movd %2, m1 + movd %4, m0 +%else + movd %1, m1 + movd %5, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %2, m1 + movd %6, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %3, m1 + movd %7, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %4, m1 + movd %8, m0 +%endif %endmacro -%macro SBUTTERFLY 4 +%macro SBUTTERFLY0 4 movq %4, %2 punpckl%1 %2, %3 punpckh%1 %4, %3 @@ -111,6 +161,7 @@ SECTION .text ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -118,30 +169,32 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY bw, m6, %8, m5 - SBUTTERFLY wd, m0, m2, m1 - SBUTTERFLY wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY0 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY wd, m7, [%9+0x10], m6 - SBUTTERFLY wd, m3, m5, m4 - SBUTTERFLY dq, m7, m3, m0 - SBUTTERFLY dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY0 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -149,29 +202,30 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - SBUTTERFLY bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY wd, m0, m2, m3 - SBUTTERFLY wd, m4, m6, m2 - SBUTTERFLY wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY wd, m2, m5, m1 - SBUTTERFLY dq, m0, m4, m5 - SBUTTERFLY dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY0 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY dq, m3, %11, m0 
- SBUTTERFLY dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY0 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 @@ -360,7 +414,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -370,7 +424,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -479,7 +533,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -487,7 +541,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -762,117 +816,152 @@ DEBLOCK_LUMA_INTRA mmxext, v8 -INIT_MMX - %macro CHROMA_V_START 0 dec r2d ; alpha-1 dec r3d ; beta-1 mov t5, r0 sub t5, r1 sub t5, r1 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif %endmacro %macro CHROMA_H_START 0 dec r2d dec r3d - sub r0, 2 + sub r0, 4 lea t6, [r1*3] mov t5, r0 add r0, t6 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif +%endmacro + +%macro CHROMA_V_LOOP 1 +%if mmsize==8 + add r0, 8 + add t5, 8 +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif +%endmacro + +%macro CHROMA_H_LOOP 1 +%if mmsize==8 + lea r0, [r0+r1*4] + lea t5, [t5+r1*4] +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif 
%endmacro %define t5 r5 %define t6 r6 +%macro DEBLOCK_CHROMA 1 ;----------------------------------------------------------------------------- ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal x264_deblock_v_chroma_%1, 5,6 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_inter_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 1 RET ;----------------------------------------------------------------------------- ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 -%ifdef ARCH_X86_64 - %define buf0 [rsp-24] - %define buf1 [rsp-16] -%else - %define buf0 r0m - %define buf1 r2m -%endif +cglobal x264_deblock_h_chroma_%1, 5,7 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - call chroma_inter_body_mmxext - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_inter_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 1 RET ALIGN 16 -chroma_inter_body_mmxext: +RESET_MM_PERMUTATION +chroma_inter_body_%1: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 + punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 ret +%endmacro ; DEBLOCK_CHROMA +INIT_XMM +DEBLOCK_CHROMA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA mmxext +%endif ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 %macro CHROMA_INTRA_P0 3 - movq m4, %1 + mova m4, %1 pxor m4, %3 pand m4, [pb_01] ; m4 = (p0^q1)&1 pavgb %1, %3 
psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) %endmacro %define t5 r4 %define t6 r5 +%macro DEBLOCK_CHROMA_INTRA 1 ;----------------------------------------------------------------------------- ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal x264_deblock_v_chroma_intra_%1, 4,5 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_intra_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 0 RET ;----------------------------------------------------------------------------- ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal x264_deblock_h_chroma_intra_%1, 4,6 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 0 RET ALIGN 16 -chroma_intra_body_mmxext: +RESET_MM_PERMUTATION +chroma_intra_body_%1: LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 + mova m5, m1 + mova m6, m2 CHROMA_INTRA_P0 m1, m0, m3 CHROMA_INTRA_P0 m2, m3, m0 psubb m1, m5 @@ -882,3 +971,11 @@ chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret +%endmacro ; DEBLOCK_CHROMA_INTRA + +INIT_XMM +DEBLOCK_CHROMA_INTRA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA_INTRA mmxext +%endif diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 6dbb5fc..3709078 
100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -28,7 +28,21 @@ SECTION_RODATA 32 -ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 +;ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 +ch4_shuffle: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch4_shuffle_adj: times 8 db 0 + times 8 db 2 + times 8 db 4 + times 8 db 6 + times 8 db 8 + times 8 db 10 + times 8 db 12 + times 8 db 14 +ch8u_shuffle: db 0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14 +ch8v_shuffle: db 1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15 +ch8_shuffle: db 0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16 +pb_15: times 16 db 15 +pb_16: times 16 db 16 pw_1: times 8 dw 1 pw_4: times 8 dw 4 pw_8: times 8 dw 8 @@ -959,6 +973,8 @@ cglobal x264_prefetch_ref_mmxext, 3,3 ; chroma MC ;============================================================================= +%if 0 ; OBSOLETE + %define t0 rax %ifdef ARCH_X86_64 %define t1 r10 @@ -1309,3 +1325,140 @@ INIT_XMM MC_CHROMA_SSSE3 , 8 MC_CHROMA_SSSE3 _cache64, 9 + +%endif ; OBSOLETE + + + +;============================================================================= +; chroma MC +;============================================================================= + +%ifdef ARCH_X86_64 + %define t0 r10 + %define t1 r11 +%else + %define t0 r0 + %define t1 r1 +%endif + +%macro MC_CHROMA_START 0 + movifnidn r3, r3mp + movifnidn r4d, r4m + movifnidn r5d, r5m + movifnidn r6d, r6m + mov t0d, r6d + mov t1d, r5d + sar t0d, 3 + sar t1d, 3 + imul t0d, r4d + lea t0d, [t0+t1*2] + movsxdifnidn t0, t0d + add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride +%endmacro + +;----------------------------------------------------------------------------- +; void x264_mc_chroma_mmxext( uint8_t *dstu, uint8_t *dstv, int dst_stride, +; uint8_t *src, int src_stride, +; int dx, int dy, +; int width, int height ) +;----------------------------------------------------------------------------- +%macro MC_CHROMA_SSSE3 0-1 +INIT_XMM +cglobal 
x264_mc_chroma_ssse3%1, 0,7,8 ; FIXME reduce reg count + MC_CHROMA_START + and r5d, 7 + and r6d, 7 + mov t0d, r5d + shl t0d, 8 + sub t0d, r5d + mov r5d, 8 + add t0d, 8 + sub r5d, r6d + imul r6d, t0d ; (x*255+8)*y + imul r5d, t0d ; (x*255+8)*(8-y) + movd m6, r6d + movd m7, r5d +%ifidn %1, _cache64 + mov t0d, r3d + and t0d, 7 +%ifdef PIC + lea t1, [ch4_shuffle_adj] + movddup m5, [t1 + t0*4] +%else + movddup m5, [ch4_shuffle_adj + t0*4] +%endif + paddb m5, [ch4_shuffle] + and r3, ~7 +%else + mova m5, [ch4_shuffle] ; FIXME slower than memory operand? +%endif + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + SPLATW m6, m6 + SPLATW m7, m7 +%ifdef ARCH_X86_64 + mov r10, r0 + mov r11, r1 + mov r6, r3 +%else + mov r3m, r3 +%endif +.loopx: + mov r5d, r8m + movu m0, [r3] + pshufb m0, m5 +.loopy: + movu m1, [r3+r4] + pshufb m1, m5 + movu m3, [r3+r4*2] + pshufb m3, m5 + mova m2, m1 + mova m4, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + pmaddubsw m3, m6 + paddw m0, [pw_32] + paddw m2, [pw_32] + paddw m1, m0 + paddw m3, m2 + mova m0, m4 + psrlw m1, 6 + psrlw m3, 6 + packuswb m1, m3 + movhlps m3, m1 + movd [r0], m1 + movd [r0+r2], m3 + psrldq m1, 4 + psrldq m3, 4 + movd [r1], m1 + movd [r1+r2], m3 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loopy + sub dword r7m, 4 + jg .continue + REP_RET +.continue: +%ifdef ARCH_X86_64 + lea r3, [r6+8] + lea r0, [r10+4] + lea r1, [r11+4] +%else + mov r3, r3m + mov r0, r0m + mov r1, r1m + add r3, 8 + add r0, 4 + add r1, 4 +%endif + jmp .loopx +%endmacro + +MC_CHROMA_SSSE3 +MC_CHROMA_SSSE3 _cache64 + diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 20ef5d7..77374f3 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -32,10 +32,12 @@ filt_mul20: times 16 db 20 filt_mul15: times 8 db 1, -5 filt_mul51: times 8 db -5, 1 hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 
15 pw_1: times 8 dw 1 pw_16: times 8 dw 16 pw_32: times 8 dw 32 +pw_00ff: times 8 dw 0xff pd_128: times 4 dd 128 SECTION .text @@ -692,6 +694,143 @@ cglobal x264_plane_copy_core_mmxext, 6,7 RET +%macro INTERLEAVE 3 + movq m0, [%2] + movq m1, [%3] +%if mmsize==16 + punpcklbw m0, m1 + mova [%1], m0 +%else + mova m2, m0 + punpcklbw m0, m1 + punpckhbw m2, m1 + mova [%1], m0 + mova [%1+8], m2 +%endif +%endmacro + +%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant +%if mmsize==16 +%ifidn %5, ssse3 + mova m0, [%3] + pshufb m0, %6 +%else + mova m0, [%3] + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 + packuswb m0, m1 +%endif +%if %4 + mova [%1], m0 +%else + movq [%1], m0 + movhps [%2], m0 +%endif +%else + mova m0, [%3] + mova m1, [%3+8] + mova m2, m0 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endif +%endmacro + +%macro PLANE_INTERLEAVE 1 +;----------------------------------------------------------------------------- +; void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst, +; uint8_t *srcu, int i_srcu, +; uint8_t *srcv, int i_srcv, int w, int h ) +;----------------------------------------------------------------------------- +; assumes i_dst and w are multiples of 16, and i_dst>2*w +cglobal x264_plane_copy_interleave_core_%1, 6,7 + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + lea r0, [r0+r6*2] + add r2, r6 + add r4, r6 +.loopy: + mov r6d, r6m + neg r6 +.loopx: + ; FIXME prefetch + INTERLEAVE r0+r6*2, r2+r6, r4+r6 + INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8 + add r6, 16 + jl .loopx + add r0, r1 + add r2, r3 + add r4, r5 + dec dword r7m + jg .loopy + emms + RET + +;----------------------------------------------------------------------------- +; void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ) 
+;----------------------------------------------------------------------------- +cglobal x264_store_interleave_8x8x2_%1, 4,5 + mov r4d, 4 +.loop: + ; FIXME exploit aligned sources + INTERLEAVE r0, r2, r3 + INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE + add r2, FDEC_STRIDE*2 + add r3, FDEC_STRIDE*2 + lea r0, [r0+r1*2] + dec r4d + jg .loop + REP_RET +%endmacro ; PLANE_INTERLEAVE + +%macro PLANE_DEINTERLEAVE 1 +;----------------------------------------------------------------------------- +; void x264_load_deinterleave_8x8x2_mmxext( uint8_t *dst, uint8_t *src, int i_src ) +;----------------------------------------------------------------------------- +cglobal x264_load_deinterleave_8x8x2_%1, 3,4 +%ifidn %1, ssse3 + mova m2, [deinterleave_shuf] +%endif + mov r3d, 4 +.loop: + ; FIXME mmx doesn't benefit from unrolling + DEINTERLEAVE r0, r0+8, r1, 1, %1, m2 + DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE+8, r1+r2, 1, %1, m2 + add r0, FENC_STRIDE*2 + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void x264_load_deinterleave_9x1x2_mmxext( uint8_t *dstu, uint8_t *dstv, uint8_t *src ) +;----------------------------------------------------------------------------- +cglobal x264_load_deinterleave_9x1x2_%1, 3,3 + DEINTERLEAVE r0+1, r1+1, r2+2, 0, %1, [deinterleave_shuf] + movzx r2d, word [r2] + mov [r0], r2b + shr r2d, 8 + mov [r1], r2b + RET +%endmacro ; PLANE_DEINTERLEAVE + +INIT_MMX +PLANE_INTERLEAVE mmxext +PLANE_DEINTERLEAVE mmxext +INIT_XMM +PLANE_INTERLEAVE sse2 +PLANE_DEINTERLEAVE sse2 +PLANE_DEINTERLEAVE ssse3 + ; These functions are not general-use; not only do the SSE ones require aligned input, ; but they also will fail if given a non-mod16 size or a size less than 64. 
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index fd04392..c5d1bf7 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -76,20 +76,37 @@ extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int ); extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_prefetch_ref_mmxext( uint8_t *, int, int ); -extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, +extern void x264_mc_chroma_mmxext( uint8_t *dstu, uint8_t *dstv, int i_dst, + uint8_t *src, int i_src, int dx, int dy, int i_width, int i_height ); -extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, +extern void x264_mc_chroma_sse2( uint8_t *dstu, uint8_t *dstv, int i_dst, + uint8_t *src, int i_src, int dx, int dy, int i_width, int i_height ); -extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, +extern void x264_mc_chroma_ssse3( uint8_t *dstu, uint8_t *dstv, int i_dst, + uint8_t *src, int i_src, int dx, int dy, int i_width, int i_height ); +extern void x264_mc_chroma_ssse3_cache64( uint8_t *dstu, uint8_t *dstv, int i_dst, + uint8_t *src, int i_src, + int dx, int dy, int i_width, int i_height ); extern void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); -extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h); +extern void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h ); +extern void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +extern void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst, + uint8_t *srcu, 
int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +extern void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +extern void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +extern void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +extern void x264_load_deinterleave_8x8x2_mmxext( uint8_t *dst, uint8_t *src, int i_src ); +extern void x264_load_deinterleave_8x8x2_sse2( uint8_t *dst, uint8_t *src, int i_src ); +extern void x264_load_deinterleave_8x8x2_ssse3( uint8_t *dst, uint8_t *src, int i_src ); +extern void x264_load_deinterleave_9x1x2_mmxext( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); +extern void x264_load_deinterleave_9x1x2_sse2( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); +extern void x264_load_deinterleave_9x1x2_ssse3( uint8_t *dstu, uint8_t *dstv, uint8_t *src ); extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); extern void x264_memzero_aligned_mmx( void * dst, int n ); @@ -343,11 +360,13 @@ HPEL(16, ssse3, ssse3, ssse3, ssse3) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) -static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h) +static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h ) { if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. 
x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if(i_src > 0) { + } else if( !(w&15) ) { + x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h ); + } else if( i_src > 0 ) { // have to use plain memcpy on the last line (in memory order) to avoid overreading src x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 ); memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w ); @@ -357,6 +376,27 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i } } +#define PLANE_INTERLEAVE(cpu) \ +static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\ + uint8_t *srcu, int i_srcu,\ + uint8_t *srcv, int i_srcv, int w, int h )\ +{\ + if( !(w&15) ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( i_srcu > 0 ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ + x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ + } else {\ + x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ + }\ +} + +PLANE_INTERLEAVE(mmxext) +PLANE_INTERLEAVE(sse2) + void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) @@ -376,7 +416,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->mc_luma = mc_luma_mmxext; pf->get_ref = get_ref_mmxext; - pf->mc_chroma = x264_mc_chroma_mmxext; +// pf->mc_chroma = x264_mc_chroma_mmxext; pf->weight = x264_mc_weight_wtab_mmxext; pf->offsetadd = x264_mc_offsetadd_wtab_mmxext; @@ -392,7 +432,13 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext; + 
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext; + pf->load_deinterleave_8x8x2 = x264_load_deinterleave_8x8x2_mmxext; + pf->load_deinterleave_9x1x2 = x264_load_deinterleave_9x1x2_mmxext; + pf->plane_copy = x264_plane_copy_mmxext; + pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext; + pf->hpel_filter = x264_hpel_filter_mmxext; pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext; @@ -441,10 +487,15 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_SSE_MISALIGN ) pf->hpel_filter = x264_hpel_filter_sse2_misalign; pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - pf->mc_chroma = x264_mc_chroma_sse2; +// pf->mc_chroma = x264_mc_chroma_sse2; + + pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; // FIXME slow? fast? shuffle? if( cpu&X264_CPU_SSE2_IS_FAST ) { + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; + pf->load_deinterleave_8x8x2 = x264_load_deinterleave_8x8x2_sse2; + pf->load_deinterleave_9x1x2 = x264_load_deinterleave_9x1x2_sse2; pf->mc_luma = mc_luma_sse2; pf->get_ref = get_ref_sse2; if( cpu&X264_CPU_CACHELINE_64 ) @@ -468,6 +519,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->load_deinterleave_8x8x2 = x264_load_deinterleave_8x8x2_ssse3; + pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->mc_chroma = x264_mc_chroma_ssse3; @@ -483,7 +536,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) } if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + { + pf->load_deinterleave_9x1x2 = x264_load_deinterleave_9x1x2_ssse3; pf->integral_init4v = x264_integral_init4v_ssse3; + } if( !(cpu&X264_CPU_SSE4) ) return; diff --git a/encoder/analyse.c b/encoder/analyse.c index 2ece9dc..2ebefa9 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1092,8 +1092,8 @@ static void x264_intra_rd_refine( x264_t 
*h, x264_mb_analysis_t *a ) (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \ - (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ - (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[5] = &(src)[5][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = weight_none; \ (m)->i_ref = ref; @@ -1474,11 +1474,11 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; + // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ - h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[1].weightfn ) \ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ - h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[2].weightfn ) \ weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); diff --git a/encoder/encoder.c b/encoder/encoder.c index e9f297f..1347338 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -66,9 +66,15 @@ static void x264_frame_dump( x264_t *h ) return; /* Write the frame in display order */ fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET ); - for( int i = 0; i < h->fdec->i_plane; i++ ) - 
for( int y = 0; y < h->param.i_height >> !!i; y++ ) - fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f ); + for( int y = 0; y < h->param.i_height; y++ ) + fwrite( &h->fdec->plane[0][y*h->fdec->i_stride[0]], 1, h->param.i_width, f ); + int cw = h->param.i_width>>1; + int ch = h->param.i_height>>1; + uint8_t *planeu = x264_malloc( cw*ch*2 ); + uint8_t *planev = planeu + cw*ch; + h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); + fwrite( planeu, 1, cw*ch*2, f ); + x264_free( planeu ); fclose( f ); } @@ -1568,11 +1574,11 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y ) if( !b_end && !h->param.b_sliced_threads ) for( int j = 0; j <= h->sh.b_mbaff; j++ ) - for( int i = 0; i < 3; i++ ) + for( int i = 0; i < 2; i++ ) { memcpy( h->mb.intra_border_backup[j][i], h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i], - h->sps->i_mb_width*16 >> !!i ); + h->sps->i_mb_width*16 ); } if( b_deblock ) diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 50f939a..5a42761 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -486,7 +486,7 @@ static void x264_macroblock_encode_pskip( x264_t *h ) h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0], mvx, mvy, 16, 16, &h->sh.weight[0][0] ); - h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE, + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], mvx, mvy, 8, 8 ); @@ -494,11 +494,6 @@ static void x264_macroblock_encode_pskip( x264_t *h ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &h->sh.weight[0][1], 8 ); - - h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], - mvx, mvy, 8, 8 ); - if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, @@ 
-945,22 +940,20 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) i_qp = h->mb.i_chroma_qp; thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + if( !b_bidir ) + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, + h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], + mvp[0], mvp[1], 8, 8 ); + for( int ch = 0; ch < 2; ch++ ) { uint8_t *p_src = h->mb.pic.p_fenc[1+ch]; uint8_t *p_dst = h->mb.pic.p_fdec[1+ch]; - if( !b_bidir ) - { - h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], - mvp[0], mvp[1], 8, 8 ); - - if( h->sh.weight[0][1+ch].weightfn ) - h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - &h->sh.weight[0][1+ch], 8 ); - } + if( !b_bidir && h->sh.weight[0][1+ch].weightfn ) + h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + &h->sh.weight[0][1+ch], 8 ); /* there is almost never a termination during chroma, but we can't avoid the check entirely */ /* so instead we check SSD and skip the actual check if the score is low enough. 
*/ diff --git a/encoder/me.c b/encoder/me.c index 6788022..06c6d65 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -775,18 +775,17 @@ if( b_refine_qpel || (dir^1) != odir ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ if( b_chroma_me && cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ + h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \ &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \ &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \ @@ -909,10 +908,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite stride[list][i] = bw;\ src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \ if( rd )\ - {\ - h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], 
m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ } #define SATD_THRESH 17/16 @@ -1094,10 +1090,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei uint64_t cost; \ M32( cache_mv ) = pack16to32_mask(mx,my); \ if( m->i_pixel <= PIXEL_8x8 )\ - {\ - h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index b51dbf7..58f1a21 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -205,20 +205,30 @@ static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale ) + rce->misc_bits; } +static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_sqr, int shift ) +{ + uint32_t sum = (uint32_t)sum_sqr; + uint32_t sqr = sum_sqr >> 32; + return sqr - (sum * sum >> shift); +} + static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i ) { int w = i ? 8 : 16; - int shift = i ? 6 : 8; int stride = frame->i_stride[i]; int offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride - : w * (mb_x + mb_y * stride); - int pix = i ? PIXEL_8x8 : PIXEL_16x16; + ? 
16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride + : 16 * mb_x + w * mb_y * stride; stride <<= h->mb.b_interlaced; - uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride ); - uint32_t sum = (uint32_t)res; - uint32_t sqr = res >> 32; - return sqr - (sum * sum >> shift); + if( i ) + { + ALIGNED_ARRAY_16( uint8_t, pix,[FENC_STRIDE*8] ); + h->mc.load_deinterleave_8x8x2( pix, frame->plane[1] + offset, stride ); + return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6 ) + + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6 ); + } + else + return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8 ); } // Find the total AC energy of the block in all planes. @@ -230,7 +240,6 @@ static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame * sure no reordering goes on. */ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 ); var += ac_energy_plane( h, mb_x, mb_y, frame, 1 ); - var += ac_energy_plane( h, mb_x, mb_y, frame, 2 ); x264_emms(); return var; } diff --git a/tools/checkasm.c b/tools/checkasm.c index 5d0e98d..862c322 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -823,12 +823,15 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ memset( buf3, 0xCD, 1024 ); \ memset( buf4, 0xCD, 1024 ); \ - call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \ - call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \ + call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \ + call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. 
*/\ for( int j = 0; j < h; j++ ) \ - for( int i = w; i < 4; i++ ) \ + for( int i = w; i < 8; i++ ) \ + { \ + dst2[i+j*16+8] = dst1[i+j*16+8]; \ dst2[i+j*16] = dst1[i+j*16]; \ + } \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ @@ -862,8 +865,8 @@ static int check_mc( int cpu_ref, int cpu_new ) MC_TEST_CHROMA( 4, 8 ); MC_TEST_CHROMA( 4, 4 ); MC_TEST_CHROMA( 4, 2 ); - MC_TEST_CHROMA( 2, 4 ); - MC_TEST_CHROMA( 2, 2 ); +// MC_TEST_CHROMA( 2, 4 ); +// MC_TEST_CHROMA( 2, 2 ); } report( "mc chroma :" ); #undef MC_TEST_LUMA @@ -970,6 +973,99 @@ static int check_mc( int cpu_ref, int cpu_new ) } report( "mc offsetsub :" ); + ok = 1; used_asm = 0; + if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 ) + { + set_func_name( "store_interleave_8x8x2" ); + used_asm = 1; + memset( buf3, 0, 64*8 ); + memset( buf4, 0, 64*8 ); + call_c( mc_c.store_interleave_8x8x2, buf3, 64, buf1, buf1+16 ); + call_a( mc_a.store_interleave_8x8x2, buf4, 64, buf1, buf1+16 ); + if( memcmp( buf3, buf4, 64*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_8x8x2 != mc_ref.load_deinterleave_8x8x2 ) + { + set_func_name( "load_deinterleave_8x8x2" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_8x8x2, buf3, buf1, 64 ); + call_a( mc_a.load_deinterleave_8x8x2, buf4, buf1, 64 ); + if( memcmp( buf3, buf4, FENC_STRIDE*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_9x1x2 != mc_ref.load_deinterleave_9x1x2 ) + { + set_func_name( "load_deinterleave_9x1x2" ); + used_asm = 1; + memset( buf3, 0, 32 ); + memset( buf4, 0, 32 ); + call_c( mc_c.load_deinterleave_9x1x2, buf3+7, buf3+16+7, buf1+14 ); + call_a( mc_a.load_deinterleave_9x1x2, buf4+7, buf4+16+7, buf1+14 ); + if( memcmp( buf3, buf4, 32 ) ) + ok = 0; + } + report( "store_interleave :" ); + + struct plane_spec { + int w, h, src_stride; + } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} }; 
+ ok = 1; used_asm = 0; + if( mc_a.plane_copy != mc_ref.plane_copy ) + { + set_func_name( "plane_copy" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = plane_specs[i].w; + int h = plane_specs[i].h; + int src_stride = plane_specs[i].src_stride; + int dst_stride = (w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy, buf3, dst_stride, src, src_stride, w, h ); + call_a( mc_a.plane_copy, buf4, dst_stride, src, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + report( "plane_copy :" ); + + ok = 1; used_asm = 0; + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) + { + set_func_name( "plane_copy_interleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int src_stride = (plane_specs[i].src_stride + 1) >> 1; + int dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_interleave, buf3, dst_stride, src, src_stride, src+1024, src_stride+16, w, h ); + call_a( mc_a.plane_copy_interleave, buf4, dst_stride, src, src_stride, src+1024, src_stride+16, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, 2*w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + report( "plane_copy :" ); + if( mc_a.hpel_filter != mc_ref.hpel_filter ) { uint8_t *srchpel = buf1+8+2*64;