diff --git a/common/common.h b/common/common.h index 91d5030..c8d5863 100644 --- a/common/common.h +++ b/common/common.h @@ -635,16 +635,16 @@ struct x264_t ALIGNED_16( uint32_t fenc_satd_cache[32] ); /* pointer over mb of the frame to be compressed */ - uint8_t *p_fenc[3]; + uint8_t *p_fenc[3]; /* y,u,v */ /* pointer to the actual source frame, not a block copy */ - uint8_t *p_fenc_plane[3]; + uint8_t *p_fenc_plane[2]; /* y,uv */ /* pointer over mb of the frame to be reconstructed */ uint8_t *p_fdec[3]; /* pointer over mb of the references */ int i_fref[2]; - uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */ + uint8_t *p_fref[2][32][4+1]; /* last: yN, yH, yV, yHV, uv */ uint8_t *p_fref_w[32]; /* weighted fullpel luma */ uint16_t *p_integral[2][16]; @@ -778,7 +778,7 @@ struct x264_t /* Buffers that are allocated per-thread even in sliced threads. */ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */ - uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ + uint8_t *intra_border_backup[2][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ /* CPU functions dependents */ x264_predict_t predict_16x16[4+3]; diff --git a/common/deblock.c b/common/deblock.c index f0203c2..27709e2 100755 --- a/common/deblock.c +++ b/common/deblock.c @@ -170,7 +170,8 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int pix += 2*ystride; continue; } - for( int d = 0; d < 2; d++ ) + for( int d = 0; d < 2; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -183,17 +184,16 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */ pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */ } - pix += ystride; } 
} } static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 ); + deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); } static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) @@ -248,9 +248,10 @@ static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int bet deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int dir ) { - for( int d = 0; d < 8; d++ ) + for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) + for( int e = 0; e < (dir?1:2); e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -262,19 +263,18 @@ static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystrid pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } - pix += ystride; } } static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, stride, 1, alpha, beta ); + deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 ); } static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, 1, stride, alpha, beta ); + deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 ); } -static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) +static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, uint8_t bS[4], int 
i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp + h->sh.i_alpha_c0_offset; int alpha = alpha_table(index_a); @@ -289,12 +289,10 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_ tc[2] = tc0_table(index_a)[bS[2]] + b_chroma; tc[3] = tc0_table(index_a)[bS[3]] + b_chroma; - pf_inter( pix1, i_stride, alpha, beta, tc ); - if( b_chroma ) - pf_inter( pix2, i_stride, alpha, beta, tc ); + pf_inter( pix, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static inline void deblock_edge_intra( x264_t *h, uint8_t *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) { int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset); int beta = beta_table(i_qp + h->sh.i_beta_offset); @@ -302,9 +300,7 @@ static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, if( !alpha || !beta ) return; - pf_intra( pix1, i_stride, alpha, beta ); - if( b_chroma ) - pf_intra( pix2, i_stride, alpha, beta ); + pf_intra( pix, i_stride, alpha, beta ); } void x264_frame_deblock_row( x264_t *h, int mb_y ) @@ -347,13 +343,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int i_qp = h->mb.qp[mb_xy]; int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 
1 : 4; uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; - uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x; - uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x; + uint8_t *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x; if( b_interlaced && (mb_y&1) ) { pixy -= 15*stridey; - pixu -= 7*strideuv; - pixv -= 7*strideuv; + pixuv -= 7*strideuv; } x264_prefetch_fenc( h, h->fdec, mb_x, mb_y ); @@ -368,14 +362,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) if( i_dir == 0 )\ {\ /* vertical edge */\ - deblock_edge##intra( h, pixy + 4*i_edge, NULL,\ + deblock_edge##intra( h, pixy + 4*i_edge,\ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\ h->loopf.deblock_h_luma##intra );\ if( !(i_edge & 1) )\ {\ /* U/V planes */\ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\ - deblock_edge##intra( h, pixu + 2*i_edge, pixv + 2*i_edge,\ + deblock_edge##intra( h, pixuv + 4*i_edge,\ stride2uv, bS, i_qpc, 1,\ h->loopf.deblock_h_chroma##intra );\ }\ @@ -383,14 +377,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) else\ {\ /* horizontal edge */\ - deblock_edge##intra( h, pixy + 4*i_edge*stride2y, NULL,\ + deblock_edge##intra( h, pixy + 4*i_edge*stride2y,\ stride2y, bS, (i_qp+i_qpn+1) >> 1, 0,\ h->loopf.deblock_v_luma##intra );\ /* U/V planes */\ if( !(i_edge & 1) )\ {\ int i_qpc = (h->chroma_qp_table[i_qp] + h->chroma_qp_table[i_qpn] + 1) >> 1;\ - deblock_edge##intra( h, pixu + 2*i_edge*stride2uv, pixv + 2*i_edge*stride2uv,\ + deblock_edge##intra( h, pixuv + 2*i_edge*stride2uv,\ stride2uv, bS, i_qpc, 1,\ h->loopf.deblock_v_chroma##intra );\ }\ @@ -506,21 +500,25 @@ void x264_frame_deblock( x264_t *h ) } #ifdef HAVE_MMX -void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -void 
x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); - void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); #ifdef ARCH_X86 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +// FIXME this wrapper has a significant cpu cost static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 ); @@ -560,22 +558,26 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) #ifdef HAVE_MMX if( cpu&X264_CPU_MMXEXT ) { - pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext; - pf->deblock_h_chroma 
= x264_deblock_h_chroma_mmxext; - pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; - pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; #ifdef ARCH_X86 pf->deblock_v_luma = x264_deblock_v_luma_mmxext; pf->deblock_h_luma = x264_deblock_h_luma_mmxext; + pf->deblock_v_chroma = x264_deblock_v_chroma_mmxext; + pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext; + pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext; #endif if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_v_luma = x264_deblock_v_luma_sse2; pf->deblock_h_luma = x264_deblock_h_luma_sse2; + pf->deblock_v_chroma = x264_deblock_v_chroma_sse2; + pf->deblock_h_chroma = x264_deblock_h_chroma_sse2; pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2; pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2; + pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_sse2; + pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_sse2; } } #endif @@ -593,8 +595,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) { pf->deblock_v_luma = x264_deblock_v_luma_neon; pf->deblock_h_luma = x264_deblock_h_luma_neon; - pf->deblock_v_chroma = x264_deblock_v_chroma_neon; - pf->deblock_h_chroma = x264_deblock_h_chroma_neon; +// pf->deblock_v_chroma = x264_deblock_v_chroma_neon; +// pf->deblock_h_chroma = x264_deblock_h_chroma_neon; } #endif } diff --git a/common/frame.c b/common/frame.c index aa10773..e4f5d89 100644 --- a/common/frame.c +++ b/common/frame.c @@ -44,20 +44,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) i_stride = ALIGN( i_width + 2*PADH, align ); i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced ); - frame->i_plane = 3; - for( int i = 0; i < 3; i++ ) + frame->i_plane = 2; + for( int i = 0; i < 2; i++ ) { -
frame->i_stride[i] = ALIGN( i_stride >> !!i, align ); - frame->i_width[i] = i_width >> !!i; - frame->i_lines[i] = i_lines >> !!i; - } - - luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); - chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv)); - for( int i = 1; i < 3; i++ ) - { - CHECKED_MALLOC( frame->buffer[i], chroma_plane_size ); - frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; + frame->i_stride[i] = ALIGN( i_stride, align ); + frame->i_width[i] = i_width >> i; + frame->i_lines[i] = i_lines >> i; } for( int i = 0; i < h->param.i_bframe + 2; i++ ) @@ -83,6 +75,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) frame->orig = frame; + luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); + chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv)); + + CHECKED_MALLOC( frame->buffer[1], chroma_plane_size ); + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH; + /* all 4 luma planes allocated together, since the cacheline split code * requires them to be in-phase wrt cacheline alignment. */ if( h->param.analyse.i_subpel_refine && b_fdec ) @@ -216,10 +214,31 @@ void x264_frame_delete( x264_frame_t *frame ) x264_free( frame ); } +static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift ) +{ + int width = h->param.i_width >> xshift; + int height = h->param.i_height >> yshift; + *pix = src->img.plane[plane]; + *stride = src->img.i_stride[plane]; + if( src->img.i_csp & X264_CSP_VFLIP ) + { + *pix += (height-1) * *stride; + *stride = -*stride; + } + if( width > abs(*stride) ) + { + x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride ); + return -1; + } + return 0; +} + +#define get_plane_ptr(...) 
do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0) + int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) { int i_csp = src->img.i_csp & X264_CSP_MASK; - if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) + if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 && i_csp != X264_CSP_NV12 ) { x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" ); return -1; @@ -231,39 +250,47 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) dst->param = src->param; dst->i_pic_struct = src->i_pic_struct; - for( int i = 0; i < 3; i++ ) + uint8_t *pix[3]; + int stride[3]; + get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 ); + h->mc.plane_copy( dst->plane[0], dst->i_stride[0], pix[0], stride[0], + h->param.i_width, h->param.i_height ); + if( i_csp == X264_CSP_NV12 ) { - int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i; - uint8_t *plane = src->img.plane[s]; - int stride = src->img.i_stride[s]; - int width = h->param.i_width >> !!i; - int height = h->param.i_height >> !!i; - if( src->img.i_csp & X264_CSP_VFLIP ) - { - plane += (height-1)*stride; - stride = -stride; - } - if( width > abs(stride) ) - { - x264_log( h, X264_LOG_ERROR, "Input picture width is greater than stride\n" ); - return -1; - } - h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height ); + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 ); + h->mc.plane_copy( dst->plane[1], dst->i_stride[1], pix[1], stride[1], + h->param.i_width, h->param.i_height>>1 ); + } + else + { + get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 ); + get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 
2 : 1, 1, 1 ); + h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], + pix[1], stride[1], pix[2], stride[2], + h->param.i_width>>1, h->param.i_height>>1 ); } return 0; } - - -static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom ) +static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma ) { #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride ) for( int y = 0; y < i_height; y++ ) { - /* left band */ - memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh ); - /* right band */ - memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh ); + if( b_chroma ) + { + for( int x = 0; x < i_padh; x+=2 ) + CP16( PPIXEL(-x, y), PPIXEL(0, y) ); + for( int x = 0; x < i_padh; x+=2 ) + CP16( PPIXEL(i_width+x, y), PPIXEL(i_width-2, y) ); + } + else + { + /* left band */ + memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh ); + /* right band */ + memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh ); + } } /* upper band */ if( b_pad_top ) @@ -284,9 +311,9 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e for( int i = 0; i < frame->i_plane; i++ ) { int stride = frame->i_stride[i]; - int width = 16*h->sps->i_mb_width >> !!i; + int width = 16*h->sps->i_mb_width; int height = (b_end ? 
16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i; - int padh = PADH >> !!i; + int padh = PADH; int padv = PADV >> !!i; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i); @@ -294,12 +321,12 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e height += 4 >> (!!i + h->sh.b_mbaff); if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i ); } else { - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i ); } } } @@ -321,36 +348,40 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4; if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 ); } else - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 ); } } void x264_frame_expand_border_lowres( x264_frame_t *frame ) { for( int i = 0; i < 4; i++ ) - plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 ); + plane_expand_border( frame->lowres[i], 
frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 ); } void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame ) { for( int i = 0; i < frame->i_plane; i++ ) { - int i_subsample = i ? 1 : 0; - int i_width = h->param.i_width >> i_subsample; - int i_height = h->param.i_height >> i_subsample; - int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample; - int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample; + int i_width = h->param.i_width; + int i_height = h->param.i_height >> !!i; + int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width); + int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> !!i; if( i_padx ) { for( int y = 0; y < i_height; y++ ) - memset( &frame->plane[i][y*frame->i_stride[i] + i_width], - frame->plane[i][y*frame->i_stride[i] + i_width - 1], - i_padx ); + if( i ) + for( int x = 0; x < i_padx; x+=2 ) + CP16( &frame->plane[i][y*frame->i_stride[i] + i_width + x], + &frame->plane[i][y*frame->i_stride[i] + i_width - 2] ); + else + memset( &frame->plane[i][y*frame->i_stride[i] + i_width], + frame->plane[i][y*frame->i_stride[i] + i_width - 1], + i_padx ); } if( i_pady ) { diff --git a/common/frame.h b/common/frame.h index 357929e..56520d8 100644 --- a/common/frame.h +++ b/common/frame.h @@ -58,13 +58,13 @@ typedef struct x264_frame /* YUV buffer */ int i_plane; - int i_stride[3]; - int i_width[3]; - int i_lines[3]; + int i_stride[2]; + int i_width[2]; + int i_lines[2]; int i_stride_lowres; int i_width_lowres; int i_lines_lowres; - uint8_t *plane[3]; + uint8_t *plane[2]; uint8_t *filtered[4]; /* plane[0], H, V, HV */ uint8_t *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */ uint16_t *integral; diff --git a/common/macroblock.c b/common/macroblock.c index f402588..601d806 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -40,7 +40,8 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h if( 
h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); @@ -48,11 +49,6 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->sh.weight[i_ref][1], height*2 ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); - if( h->sh.weight[i_ref][2].weightfn ) h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, @@ -73,13 +69,10 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h if( h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); } static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height ) @@ -110,16 +103,12 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int if( h->mb.b_interlaced & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp0, tmp0+8, 16, 
h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], mvx1, mvy1, 2*width, 2*height ); h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2], - mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2], - mvx1, mvy1, 2*width, 2*height ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); + h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } void x264_mb_mc_8x8( x264_t *h, int i8 ) @@ -324,11 +313,11 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) { if( !b_lookahead ) for( int i = 0; i <= h->param.b_interlaced; i++ ) - for( int j = 0; j < 3; j++ ) + for( int j = 0; j < 2; j++ ) { /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */ - CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j ); - h->intra_border_backup[i][j] += 8; + CHECKED_MALLOCZERO( h->intra_border_backup[i][j], h->sps->i_mb_width*16+32 ); + h->intra_border_backup[i][j] += 16; } /* Allocate scratch buffer */ @@ -355,8 +344,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead ) { if( !b_lookahead ) for( int i = 0; i <= h->param.b_interlaced; i++ ) - for( int j = 0; j < 3; j++ ) - x264_free( h->intra_border_backup[i][j] - 8 ); + for( int j = 0; j < 2; j++ ) + x264_free( h->intra_border_backup[i][j] - 16 ); x264_free( h->scratch_buffer ); } @@ -457,13 +446,13 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) { int stride_y = fenc->i_stride[0]; 
int stride_uv = fenc->i_stride[1]; - int off_y = 16 * (i_mb_x + i_mb_y * stride_y); - int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv); + int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y; + int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv; h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, - fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x ); + fenc->plane[1]+off_uv, stride_uv, i_mb_x ); } -static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src ) +NOINLINE void x264_copy_column8( uint8_t *dst, uint8_t *src ) { // input pointers are offset by 4 rows because that's faster (smaller instruction size on x86) for( int i = -4; i < 4; i++ ) @@ -472,30 +461,44 @@ static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src ) static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i ) { - const int w = (i == 0 ? 16 : 8); - const int i_stride = h->fdec->i_stride[!!i]; - const int i_stride2 = i_stride << h->mb.b_interlaced; - const int i_pix_offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride - : w * (mb_x + mb_y * i_stride); - const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; - const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i]; + int w = (i ? 8 : 16); + int i_stride = h->fdec->i_stride[i]; + int i_stride2 = i_stride << h->mb.b_interlaced; + int i_pix_offset = h->mb.b_interlaced + ? 
16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + w * mb_y * i_stride; + uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; + uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16]; int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; x264_frame_t **fref[2] = { h->fref0, h->fref1 }; if( h->mb.b_interlaced ) ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride; h->mb.pic.i_stride[i] = i_stride2; h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, - h->mb.pic.p_fenc_plane[i], i_stride2, w ); - memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); + if( i ) + { + h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 ); + CP64( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec ); + CP64( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8 ); + } + else + { + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 ); + memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24 ); + } if( h->mb.b_interlaced ) for( int j = 0; j < w; j++ ) - h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + if( i ) + { + h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; + h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + } + else + h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; for( int j = 0; j < h->mb.pic.i_fref[0]; j++ ) { - h->mb.pic.p_fref[0][j][i==0 ? 
0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) + h->mb.pic.p_fref[0][j][i?4:0] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( !i ) { for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; @@ -508,8 +511,8 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x if( h->sh.i_type == SLICE_TYPE_B ) for( int j = 0; j < h->mb.pic.i_fref[1]; j++ ) { - h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) + h->mb.pic.p_fref[1][j][i?4:0] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( !i ) for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[1][j][k] = &fref[1][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; } @@ -721,16 +724,15 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) if( !h->mb.b_interlaced ) { - copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE ); - copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE ); - copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); - copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); } /* load picture pointers */ x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0 ); x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1 ); - x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2 ); if( h->fdec->integral ) { @@ 
-871,16 +873,29 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0); } -static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i ) +static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i ) { int w = i ? 8 : 16; - int i_stride = h->fdec->i_stride[!!i]; + int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << h->mb.b_interlaced; int i_pix_offset = h->mb.b_interlaced - ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride - : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride); - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, - h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); + ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + w * mb_y * i_stride; + uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16]; + if( i ) + { + h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] ); + CP64( intra_fdec, h->mb.pic.p_fdec[1]+FDEC_STRIDE*7 ); + CP64( intra_fdec+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*7 ); + h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7]; + h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7]; + } + else + { + h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 ); + CP128( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15 ); + h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15]; + } } void x264_macroblock_cache_save( x264_t *h ) @@ -897,9 +912,8 @@ void x264_macroblock_cache_save( x264_t *h ) int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy]; uint8_t *nnz = h->mb.non_zero_count[i_mb_xy]; - x264_macroblock_store_pic( h, 0 ); - x264_macroblock_store_pic( h, 1 ); - x264_macroblock_store_pic( h, 2 ); + x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 
); + x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 ); x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); diff --git a/common/macroblock.h b/common/macroblock.h index ee8c113..2bd55cc 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -277,6 +277,8 @@ void x264_macroblock_bipred_init( x264_t *h ); void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ); +void x264_copy_column8( uint8_t *dst, uint8_t *src ); + /* x264_mb_predict_mv_16x16: * set mvp with predicted mv for D_16x16 block * h->mb. need only valid values from other blocks */ diff --git a/common/mc.c b/common/mc.c index ad7fe79..55e0e0e 100644 --- a/common/mc.c +++ b/common/mc.c @@ -260,7 +260,7 @@ static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride, } /* full chroma mc (ie until 1/8 pixel)*/ -static void mc_chroma( uint8_t *dst, int i_dst_stride, +static void mc_chroma( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, uint8_t *src, int i_src_stride, int mvx, int mvy, int i_width, int i_height ) @@ -274,14 +274,20 @@ static void mc_chroma( uint8_t *dst, int i_dst_stride, int cC = (8-d8x)*d8y; int cD = d8x *d8y; - src += (mvy >> 3) * i_src_stride + (mvx >> 3); + src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; for( int y = 0; y < i_height; y++ ) { for( int x = 0; x < i_width; x++ ) - dst[x] = ( cA*src[x] + cB*src[x+1] + cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6; - dst += i_dst_stride; + { + dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] + + cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6; + dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] + + cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6; + } + dstu += i_dst_stride; + dstv += i_dst_stride; src = srcp; srcp += i_src_stride; } @@ -297,7 +303,7 @@ MC_COPY( 8 ) MC_COPY( 4 ) void x264_plane_copy_c( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int w, int h) + uint8_t *src, int i_src, int w, int h ) { while( h-- ) { @@ -307,6 +313,45 @@ void x264_plane_copy_c( uint8_t *dst, int i_dst, } } 
+void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ) +{ + for( int y=0; ycopy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; + pf->store_interleave_8x8x2 = store_interleave_8x8x2; + pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc; + pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec; + pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_interleave = x264_plane_copy_interleave_c; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; + pf->hpel_filter = hpel_filter; pf->prefetch_fenc = prefetch_fenc_null; diff --git a/common/mc.h b/common/mc.h index 68bba48..3f88fcc 100644 --- a/common/mc.h +++ b/common/mc.h @@ -68,7 +68,7 @@ typedef struct /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. */ - void (*mc_chroma)(uint8_t *dst, int i_dst, uint8_t *src, int i_src, + void (*mc_chroma)(uint8_t *dstu, uint8_t *dstv, int i_dst, uint8_t *src, int i_src, int mvx, int mvy, int i_width, int i_height ); @@ -78,8 +78,18 @@ typedef struct void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height ); void (*copy_16x16_unaligned)( uint8_t *dst, int, uint8_t *src, int, int i_height ); + void (*store_interleave_8x8x2)( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); + void (*load_deinterleave_8x8x2_fenc)( uint8_t *dst, uint8_t *src, int i_src ); + void (*load_deinterleave_8x8x2_fdec)( uint8_t *dst, uint8_t *src, int i_src ); + void (*plane_copy)( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int w, int h); + uint8_t *src, int i_src, int w, int h ); + void (*plane_copy_interleave)( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); + void (*plane_copy_deinterleave)( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t 
*dstc, uint8_t *src, int i_stride, int i_width, int i_height, int16_t *buf ); diff --git a/common/pixel.c b/common/pixel.c index 20c5170..246d999 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -96,9 +96,9 @@ PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) -int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) { - int64_t i_ssd = 0; + uint64_t i_ssd = 0; int y; int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15); @@ -136,6 +136,31 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 return i_ssd; } +static uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height ) +{ + uint32_t ssd_u=0, ssd_v=0; + for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 ) + for( int x = 0; x < width; x++ ) + { + int du = pixuv1[2*x] - pixuv2[2*x]; + int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; + ssd_u += du*du; + ssd_v += dv*dv; + } + return ssd_u + ((uint64_t)ssd_v<<32); +} + +// SSD in uint32 (i.e. packing two into uint64) can potentially overflow on +// image widths >= 11008 (or 6604 if interlaced), since this is called on blocks +// of height up to 12 (resp 20). Though it will probably take significantly more +// than that at sane distortion levels. 
+uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) +{ + uint64_t ssd = pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height ); + if( i_width&7 ) + ssd += pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height ); + return ssd; +} /**************************************************************************** * pixel_var_wxh @@ -632,6 +657,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var[PIXEL_16x16] = x264_pixel_var_16x16; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8; + pixf->ssd_nv12_core = pixel_ssd_nv12_core; pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; pixf->var2_8x8 = pixel_var2_8x8; @@ -655,6 +681,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT_ADS( _mmxext ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext; #ifdef ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; @@ -700,6 +727,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( ssd, _sse2slow ); INIT2_NAME( sad_aligned, sad, _sse2_aligned ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; diff --git a/common/pixel.h b/common/pixel.h index 1102642..7747e32 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -78,6 +78,8 @@ typedef struct uint64_t (*var[4])( uint8_t *pix, int stride ); uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride ); + uint64_t (*ssd_nv12_core)( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); void (*ssim_4x4x2_core)( const uint8_t 
*pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); @@ -110,7 +112,8 @@ typedef struct } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); -int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); +uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ); float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf ); #endif diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 53b57f6..227502a 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -21,6 +21,7 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION .text @@ -34,74 +35,123 @@ cextern pb_a1 [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + 
movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + mova m1, m0 + mova m3, m2 + punpckl%3 m0, m4 + punpckh%3 m1, m4 + punpckl%3 m2, m6 + punpckh%3 m3, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 +%macro TRANSPOSE8x4B_STORE 8 + mova m4, m0 + mova m5, m1 + mova m6, m2 punpckhdq m4, m4 punpckhdq m5, m5 punpckhdq m6, m6 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 + mova m1, m0 punpcklwd m0, m2 punpckhwd m1, m2 - movd %1, m0 + movh %1, m0 punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + movh %2, m0 + movh %3, m1 punpckhdq m1, m1 - movd %4, m1 + movh %4, m1 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 + mova m5, m4 punpcklwd m4, m6 punpckhwd m5, m6 - movd %5, m4 + movh %5, m4 punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + movh %6, m4 + movh %7, m5 punpckhdq m5, m5 - movd %8, m5 + movh %8, m5 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro + +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==16 + TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 +%else + SWAP 1, 4, 2, 3 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [t5+r1*2] + mova m3, [t5+t6] + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif +%endmacro + +%macro TRANSPOSE8x2W_STORE 8 + mova m0, m1 + punpcklwd m1, m2 + punpckhwd m0, m2 +%if mmsize==8 + movd %1, m1 + movd %3, m0 + psrlq m1, 32 + psrlq m0, 32 + movd %2, m1 + movd %4, m0 +%else + movd %1, m1 + movd %5, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %2, m1 + movd %6, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %3, m1 + movd %7, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %4, m1 + movd %8, m0 +%endif %endmacro -%macro SBUTTERFLY 4 +%macro SBUTTERFLY0 4 movq %4, 
%2 punpckl%1 %2, %3 punpckh%1 %4, %3 @@ -110,6 +160,7 @@ cextern pb_a1 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -117,30 +168,32 @@ cextern pb_a1 movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY bw, m6, %8, m5 - SBUTTERFLY wd, m0, m2, m1 - SBUTTERFLY wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY0 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY wd, m7, [%9+0x10], m6 - SBUTTERFLY wd, m3, m5, m4 - SBUTTERFLY dq, m7, m3, m0 - SBUTTERFLY dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY0 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -148,29 +201,30 @@ cextern pb_a1 movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - SBUTTERFLY bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY wd, m0, m2, m3 - SBUTTERFLY wd, m4, m6, m2 - SBUTTERFLY wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY wd, m2, m5, m1 - SBUTTERFLY dq, m0, m4, m5 - SBUTTERFLY dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY0 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq 
m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY dq, m3, %11, m0 - SBUTTERFLY dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY0 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 @@ -359,7 +413,7 @@ cglobal deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -369,7 +423,7 @@ cglobal deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -478,7 +532,7 @@ cglobal deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -486,7 +540,7 @@ cglobal deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -761,117 +815,152 @@ DEBLOCK_LUMA_INTRA mmxext, v8 -INIT_MMX - %macro CHROMA_V_START 0 dec r2d ; alpha-1 dec r3d ; beta-1 mov t5, r0 sub t5, r1 sub t5, r1 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif %endmacro %macro CHROMA_H_START 0 dec r2d dec r3d - sub r0, 2 + sub r0, 4 lea t6, [r1*3] mov t5, r0 add r0, t6 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif +%endmacro + +%macro CHROMA_V_LOOP 1 +%if mmsize==8 + add r0, 8 + add t5, 8 +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif +%endmacro + +%macro 
CHROMA_H_LOOP 1 +%if mmsize==8 + lea r0, [r0+r1*4] + lea t5, [t5+r1*4] +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif %endmacro %define t5 r5 %define t6 r6 +%macro DEBLOCK_CHROMA 1 ;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_%1, 5,6,8 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_inter_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 1 RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_mmxext, 5,7 -%ifdef ARCH_X86_64 - %define buf0 [rsp-24] - %define buf1 [rsp-16] -%else - %define buf0 r0m - %define buf1 r2m -%endif +cglobal deblock_h_chroma_%1, 5,7,8 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - call chroma_inter_body_mmxext - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_inter_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 1 RET ALIGN 16 -chroma_inter_body_mmxext: +RESET_MM_PERMUTATION +chroma_inter_body_%1: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 + punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 ret +%endmacro ; DEBLOCK_CHROMA +INIT_XMM +DEBLOCK_CHROMA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA mmxext +%endif ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 %macro 
CHROMA_INTRA_P0 3 - movq m4, %1 + mova m4, %1 pxor m4, %3 pand m4, [pb_1] ; m4 = (p0^q1)&1 pavgb %1, %3 psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) %endmacro %define t5 r4 %define t6 r5 +%macro DEBLOCK_CHROMA_INTRA 1 ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_%1, 4,5,8 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_intra_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 0 RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_%1, 4,6,8 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 0 RET ALIGN 16 -chroma_intra_body_mmxext: +RESET_MM_PERMUTATION +chroma_intra_body_%1: LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 + mova m5, m1 + mova m6, m2 CHROMA_INTRA_P0 m1, m0, m3 CHROMA_INTRA_P0 m2, m3, m0 psubb m1, m5 @@ -881,3 +970,11 @@ chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret +%endmacro ; DEBLOCK_CHROMA_INTRA + +INIT_XMM +DEBLOCK_CHROMA_INTRA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA_INTRA mmxext +%endif diff --git 
a/common/x86/mc-a.asm b/common/x86/mc-a.asm index f9347ba..ea70e41 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -28,15 +28,19 @@ SECTION_RODATA 32 -ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 +ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf_adj: times 8 db 0 + times 8 db 2 + times 8 db 4 + times 8 db 6 SECTION .text -cextern pw_1 cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_00ff cextern sw_64 ;============================================================================= @@ -896,28 +900,27 @@ COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa ;----------------------------------------------------------------------------- %ifdef ARCH_X86_64 cglobal prefetch_fenc_mmxext, 5,5 + and r4d, 3 mov eax, r4d - and eax, 3 - imul eax, r1d - lea r0, [r0+rax*4+64] + imul r4d, r1d + lea r0, [r0+r4*4+64] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] - and r4d, 6 - imul r4d, r3d - lea r2, [r2+r4+64] + imul eax, r3d + lea r2, [r2+rax*2+64] prefetcht0 [r2] prefetcht0 [r2+r3] RET %else -cglobal prefetch_fenc_mmxext - mov r2, [esp+20] - mov r1, [esp+8] - mov r0, [esp+4] +cglobal prefetch_fenc_mmxext, 0,3 + mov r2, r4m + mov r1, r1m + mov r0, r0m and r2, 3 imul r2, r1 lea r0, [r0+r2*4+64] @@ -927,12 +930,12 @@ cglobal prefetch_fenc_mmxext prefetcht0 [r0] prefetcht0 [r0+r1] - mov r2, [esp+20] - mov r1, [esp+16] - mov r0, [esp+12] - and r2, 6 + mov r2, r4m + mov r1, r3m + mov r0, r2m + and r2, 3 imul r2, r1 - lea r0, [r0+r2+64] + lea r0, [r0+r2*2+64] prefetcht0 [r0] prefetcht0 [r0+r1] ret @@ -963,237 +966,380 @@ cglobal prefetch_ref_mmxext, 3,3 ; chroma MC ;============================================================================= - %define t0 rax %ifdef ARCH_X86_64 - %define t1 r10 + DECLARE_REG_TMP 10,11,6 %else - %define t1 r1 + DECLARE_REG_TMP 0,1,2 %endif %macro MC_CHROMA_START 0 - movifnidn r2, r2mp - movifnidn r3d, r3m + movifnidn r3, r3mp movifnidn r4d, 
r4m movifnidn r5d, r5m - mov t0d, r5d - mov t1d, r4d + movifnidn t2d, r6m + mov t0d, t2d + mov t1d, r5d sar t0d, 3 sar t1d, 3 - imul t0d, r3d - add t0d, t1d + imul t0d, r4d + lea t0d, [t0+t1*2] movsxdifnidn t0, t0d - add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride + add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride +%endmacro + +%macro UNPACK_UNALIGNED_MEM 3 + punpcklwd %1, %3 +%endmacro + +%macro UNPACK_UNALIGNED_LOAD 3 + movh %2, %3 + punpcklwd %1, %2 %endmacro ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dst, int dst_stride, +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride, ; uint8_t *src, int src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1-2 0 -cglobal mc_chroma_%1 -%if mmsize == 16 - cmp dword r6m, 4 - jle mc_chroma_mmxext -%endif - PROLOGUE 0,6,%2 +%macro MC_CHROMA 1 +cglobal mc_chroma_%1, 0,6 MC_CHROMA_START - pxor m3, m3 - and r4d, 7 ; dx &= 7 + and r5d, 7 +%ifdef ARCH_X86_64 jz .mc1dy - and r5d, 7 ; dy &= 7 - jz .mc1dx - - movd m5, r4d - movd m6, r5d - SPLATW m5, m5 ; m5 = dx - SPLATW m6, m6 ; m6 = dy - - mova m4, [pw_8] - mova m0, m4 - psubw m4, m5 ; m4 = 8-dx - psubw m0, m6 ; m0 = 8-dy - - mova m7, m5 - pmullw m5, m0 ; m5 = dx*(8-dy) = cB - pmullw m7, m6 ; m7 = dx*dy = cD - pmullw m6, m4 ; m6 = (8-dx)*dy = cC - pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA - - mov r4d, r7m +%endif + and t2d, 7 %ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 + jz .mc1dx +%endif + shl r5d, 16 + add t2d, r5d + mov t0d, t2d + shl t2d, 8 + sub t2d, t0d + add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y) + cmp dword r7m, 4 +%if mmsize==8 +.skip_prologue: %else - mov r0, r0mp - mov r1, r1m - mov r5, r2 + jl mc_chroma_mmxext %+ .skip_prologue + WIN64_SPILL_XMM 9 %endif - -.loop2d: - movh m1, [r2+r3] - movh m0, [r2] - punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 - punpcklbw m0, m3 - 
pmullw m1, m6 ; 2nd line * cC - pmullw m0, m4 ; 1st line * cA - paddw m0, m1 ; m0 <- result - - movh m2, [r2+1] - movh m1, [r2+r3+1] - punpcklbw m2, m3 - punpcklbw m1, m3 - - paddw m0, [pw_32] - - pmullw m2, m5 ; line * cB - pmullw m1, m7 ; line * cD + movd m5, t2d + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + pxor m6, m6 + punpcklbw m5, m6 +%if mmsize==8 + pshufw m7, m5, 0xee + pshufw m6, m5, 0x00 + pshufw m5, m5, 0x55 + jge .width4 +%else +%ifdef WIN64 + cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM +%endif + pshufd m7, m5, 0x55 + punpcklwd m5, m5 + pshufd m6, m5, 0x00 + pshufd m5, m5, 0x55 + jg .width8 +%endif + movu m0, [r3] + UNPACK_UNALIGNED m0, m1, [r3+2] + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 + pmaddwd m0, m7 + pmaddwd m1, m7 + packssdw m0, m1 + SWAP m3, m0 +ALIGN 4 +.loop2: + movu m0, [r3+r4] + UNPACK_UNALIGNED m0, m1, [r3+r4+2] + pmullw m3, m6 + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 + pmaddwd m0, m7 + pmaddwd m1, m7 + mova m2, [pw_32] + packssdw m0, m1 + paddw m2, m3 + mova m3, m0 + pmullw m0, m5 paddw m0, m2 - paddw m0, m1 psrlw m0, 6 + packuswb m0, m0 + movd [r0], m0 +%if mmsize==8 + psrlq m0, 16 +%else + psrldq m0, 4 +%endif + movd [r1], m0 + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop2 + REP_RET - packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4 - movh [r0], m0 - - add r2, r3 - add r0, r1 ; dst_stride - dec r4d - jnz .loop2d - -%if mmsize == 8 - sub dword r6m, 8 - jnz .finish ; width != 8 so assume 4 +%if mmsize==8 +.width4: %ifdef ARCH_X86_64 - lea r0, [r10+4] ; dst - lea r2, [r11+4] ; src + mov t0, r0 + mov t1, r1 + mov t2, r3 + %define multy0 [rsp-8] + mova multy0, m5 %else - mov r0, r0mp - lea r2, [r5+4] - add r0, 4 + mov r3m, r3 + %define multy0 r4m + mova multy0, m5 %endif - mov r4d, r7m ; height - jmp .loop2d %else +.width8: +%ifdef ARCH_X86_64 + %define multy0 m8 + SWAP m8, m5 +%else + %define multy0 r0m + mova multy0, m5 +%endif +%endif +.loopx: + movu m0, [r3] 
+ movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + mova m2, m0 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + SWAP m4, m0 + SWAP m5, m1 + add r3, r4 +ALIGN 4 +.loop4: + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + mova m2, m0 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + pmullw m4, m6 + pmullw m5, m6 + mova m2, [pw_32] + mova m3, m2 + paddw m2, m4 + paddw m3, m5 + mova m4, m0 + mova m5, m1 + pmullw m0, multy0 + pmullw m1, multy0 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 6 + psrlw m1, 6 + packuswb m0, m1 +%if mmsize==8 + pshufw m1, m0, 0x8 + pshufw m0, m0, 0xd + movd [r0], m1 + movd [r1], m0 +%else + pshufd m0, m0, 0xd8 + movq [r0], m0 + movhps [r1], m0 +%endif + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop4 +%if mmsize!=8 + REP_RET +%else + sub dword r7m, 4 + jg .width8 REP_RET -%endif ; mmsize +.width8: +%ifdef ARCH_X86_64 + lea r3, [t2+8] + lea r0, [t0+4] + lea r1, [t1+4] +%else + mov r3, r3m + mov r0, r0m + mov r1, r1m + add r3, 8 + add r0, 4 + add r1, 4 +%endif + mov r5d, r8m + jmp .loopx +%endif +%ifdef ARCH_X86_64 ; too many regs for x86_32 + RESET_MM_PERMUTATION +%ifdef WIN64 +%if xmm_regs_used > 6 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 + %assign xmm_regs_used 6 +%endif +%endif .mc1dy: - and r5d, 7 - movd m6, r5d - mov r5, r3 ; pel_offset = dx ? 1 : src_stride + and t2d, 7 + movd m5, t2d + mov r6d, r4d ; pel_offset = dx ? 
2 : src_stride jmp .mc1d .mc1dx: - movd m6, r4d - mov r5d, 1 + movd m5, r5d + mov r6d, 2 .mc1d: - mova m5, [pw_8] - SPLATW m6, m6 - mova m7, [pw_4] - psubw m5, m6 - movifnidn r0, r0mp - movifnidn r1d, r1m - mov r4d, r7m -%if mmsize == 8 - cmp dword r6m, 8 - je .loop1d_w8 + mova m4, [pw_8] + SPLATW m5, m5 + psubw m4, m5 + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + cmp dword r7m, 4 + jg .mc1d_w8 + mov r10, r2 + mov r11, r4 +%if mmsize!=8 + shr r5d, 1 %endif - .loop1d_w4: - movh m0, [r2+r5] - movh m1, [r2] - punpcklbw m0, m3 - punpcklbw m1, m3 - pmullw m0, m6 - pmullw m1, m5 - paddw m0, m7 - paddw m0, m1 - psrlw m0, 3 - packuswb m0, m3 - movh [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w4 -.finish: - REP_RET - -%if mmsize == 8 -.loop1d_w8: - movu m0, [r2+r5] - mova m1, [r2] + movq m0, [r3] + movq m1, [r3+r6] +%if mmsize!=8 + add r3, r11 + movhps m0, [r3] + movhps m1, [r3+r6] +%endif mova m2, m0 - mova m4, m1 - punpcklbw m0, m3 - punpcklbw m1, m3 - punpckhbw m2, m3 - punpckhbw m4, m3 - pmullw m0, m6 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + pmullw m0, m4 pmullw m1, m5 - pmullw m2, m6 - pmullw m4, m5 - paddw m0, m7 - paddw m2, m7 + pmullw m2, m4 + pmullw m3, m5 + paddw m0, [pw_4] + paddw m2, [pw_4] paddw m0, m1 - paddw m2, m4 + paddw m2, m3 psrlw m0, 3 psrlw m2, 3 packuswb m0, m2 - mova [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w8 +%if mmsize==8 + xchg r4, r11 + xchg r2, r10 + movd [r0], m0 + psrlq m0, 32 + movd [r1], m0 +%else + movhlps m1, m0 + movd [r0], m0 + movd [r1], m1 + add r0, r10 + add r1, r10 + psrldq m0, 4 + psrldq m1, 4 + movd [r0], m0 + movd [r1], m1 +%endif + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop1d_w4 REP_RET -%endif ; mmsize +.mc1d_w8: + sub r2, 4 + sub r4, 8 + mov r10, 4 + mov r11, 8 +%if mmsize==8 + shl r5d, 1 +%endif + jmp .loop1d_w4 +%endif ; ARCH_X86_64 %endmacro ; MC_CHROMA -INIT_MMX -MC_CHROMA mmxext -INIT_XMM 
-MC_CHROMA sse2, 8 -%macro MC_CHROMA_SSSE3 2 -INIT_MMX -cglobal mc_chroma_ssse3%1, 0,6,%2 +%macro MC_CHROMA_SSSE3 0-1 +INIT_XMM +cglobal mc_chroma_ssse3%1, 0,6,9 MC_CHROMA_START - and r4d, 7 and r5d, 7 - mov t0d, r4d + and t2d, 7 + mov t0d, r5d shl t0d, 8 - sub t0d, r4d - mov r4d, 8 + sub t0d, r5d + mov r5d, 8 add t0d, 8 - sub r4d, r5d - imul r5d, t0d ; (x*255+8)*y - imul r4d, t0d ; (x*255+8)*(8-y) - cmp dword r6m, 4 - jg .width8 - mova m5, [pw_32] - movd m6, r5d - movd m7, r4d - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 - mov r5, r2 - and r2, ~3 - and r5, 3 + sub r5d, t2d + imul t2d, t0d ; (x*255+8)*y + imul r5d, t0d ; (x*255+8)*(8-y) + movd m6, t2d + movd m7, r5d +%ifidn %1, _cache64 + mov t0d, r3d + and t0d, 7 %ifdef PIC - lea r11, [ch_shuffle] - movu m5, [r11 + r5*2] + lea t1, [ch_shuf_adj] + movddup m5, [t1 + t0*4] %else - movu m5, [ch_shuffle + r5*2] + movddup m5, [ch_shuf_adj + t0*4] %endif - movu m0, [r2] + paddb m5, [ch_shuf] + and r3, ~7 +%else + mova m5, [ch_shuf] +%endif + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + SPLATW m6, m6 + SPLATW m7, m7 + cmp dword r7m, 4 + jg .width8 + movu m0, [r3] pshufb m0, m5 .loop4: - movu m1, [r2+r3] + movu m1, [r3+r4] pshufb m1, m5 - movu m3, [r2+2*r3] + movu m3, [r3+r4*2] pshufb m3, m5 - lea r2, [r2+2*r3] mova m2, m1 mova m4, m3 pmaddubsw m0, m7 @@ -1207,109 +1353,90 @@ cglobal mc_chroma_ssse3%1, 0,6,%2 mova m0, m4 psrlw m1, 6 psrlw m3, 6 - packuswb m1, m1 - packuswb m3, m3 - movh [r0], m1 - movh [r0+r1], m3 - sub r4d, 2 - lea r0, [r0+2*r1] + packuswb m1, m3 + movhlps m3, m1 + movd [r0], m1 + movd [r0+r2], m3 + psrldq m1, 4 + psrldq m3, 4 + movd [r1], m1 + movd [r1+r2], m3 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 jg .loop4 REP_RET -INIT_XMM .width8: - movd m6, r5d - movd m7, r4d - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 -%ifidn %1, _cache64 - mov 
r5, r2 - and r5, 0x3f - cmp r5, 0x38 - jge .split -%endif - mova m5, [pw_32] - movh m0, [r2] - movh m1, [r2+1] - punpcklbw m0, m1 -.loop8: - movh m1, [r2+1*r3] - movh m2, [r2+1*r3+1] - movh m3, [r2+2*r3] - movh m4, [r2+2*r3+1] - punpcklbw m1, m2 - punpcklbw m3, m4 - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 - pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 - paddw m1, m0 - paddw m3, m2 - mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 - packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, [r0+2*r1] - jg .loop8 - REP_RET -%ifidn %1, _cache64 -.split: - and r2, ~7 - and r5, 7 -%ifdef PIC - lea r11, [ch_shuffle] - movu m5, [r11 + r5*2] -%else - movu m5, [ch_shuffle + r5*2] -%endif - movu m0, [r2] + movu m0, [r3] pshufb m0, m5 + movu m1, [r3+8] + pshufb m1, m5 %ifdef ARCH_X86_64 - mova m8, [pw_32] - %define round m8 + SWAP m8, m6 + %define mult1 m8 %else - %define round [pw_32] + mova r0m, m6 + %define mult1 r0m %endif -.splitloop8: - movu m1, [r2+r3] - pshufb m1, m5 - movu m3, [r2+2*r3] +.loop8: + movu m2, [r3+r4] + pshufb m2, m5 + movu m3, [r3+r4+8] pshufb m3, m5 - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 + mova m4, m2 + mova m6, m3 pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, round - paddw m2, round - paddw m1, m0 - paddw m3, m2 - mova m0, m4 + pmaddubsw m1, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m0, [pw_32] + paddw m1, [pw_32] + paddw m0, m2 + paddw m1, m3 + psrlw m0, 6 psrlw m1, 6 + packuswb m0, m1 + pshufd m0, m0, 0xd8 + movq [r0], m0 + movhps [r1], m0 + + movu m2, [r3+r4*2] + pshufb m2, m5 + movu m3, [r3+r4*2+8] + pshufb m3, m5 + mova m0, m2 + mova m1, m3 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m4, [pw_32] + paddw m6, [pw_32] + paddw m2, m4 + paddw m3, m6 + psrlw m2, 6 psrlw m3, 6 - packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, [r0+2*r1] - jg 
.splitloop8 + packuswb m2, m3 + pshufd m2, m2, 0xd8 + movq [r0+r2], m2 + movhps [r1+r2], m2 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop8 REP_RET -%endif -; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size %endmacro -MC_CHROMA_SSSE3 , 8 -MC_CHROMA_SSSE3 _cache64, 9 +INIT_MMX +%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM +MC_CHROMA mmxext +INIT_XMM +MC_CHROMA sse2_misalign +%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD +MC_CHROMA sse2 +MC_CHROMA_SSSE3 +MC_CHROMA_SSSE3 _cache64 + diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 8deb9e0..ad4b33a 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -31,15 +31,17 @@ SECTION_RODATA filt_mul20: times 16 db 20 filt_mul15: times 8 db 1, -5 filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 SECTION .text cextern pw_1 cextern pw_16 cextern pw_32 -cextern pd_128 +cextern pw_00ff cextern pw_3fff +cextern pd_128 %macro LOAD_ADD 4 movh %4, %3 @@ -171,7 +173,7 @@ cglobal hpel_filter_v_%1, 5,6,%2 mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 FILT_PACK m1, m4, 5, m7 - movnt [r0+r4], m1 + movnta [r0+r4], m1 add r1, mmsize add r5, mmsize add r4, mmsize @@ -689,6 +691,213 @@ cglobal plane_copy_core_mmxext, 6,7 RET +%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint + movq m0, [%2] +%if mmsize==16 +%if %4 + punpcklbw m0, [%3] +%else + movq m1, [%3] + punpcklbw m0, m1 +%endif + mov%5a [%1], m0 +%else + movq m1, [%3] + mova m2, m0 + punpcklbw m0, m1 + punpckhbw m2, m1 + mov%5a [%1], m0 + mov%5a [%1+8], m2 +%endif +%endmacro + +%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant +%if mmsize==16 + mova m0, [%3] +%ifidn %5, ssse3 + pshufb m0, %6 +%else + mova m1, m0 + pand m0, %6 + psrlw m1, 8 + packuswb m0, m1 +%endif +%if %4 + mova [%1], m0 +%else 
+ movq [%1], m0 + movhps [%2], m0 +%endif +%else + mova m0, [%3] + mova m1, [%3+8] + mova m2, m0 + mova m3, m1 + pand m0, %6 + pand m1, %6 + psrlw m2, 8 + psrlw m3, 8 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endif +%endmacro + +%macro PLANE_INTERLEAVE 1 +;----------------------------------------------------------------------------- +; void plane_copy_interleave_core( uint8_t *dst, int i_dst, +; uint8_t *srcu, int i_srcu, +; uint8_t *srcv, int i_srcv, int w, int h ) +;----------------------------------------------------------------------------- +; assumes i_dst and w are multiples of 16, and i_dst>2*w +cglobal plane_copy_interleave_core_%1, 6,7 + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + lea r0, [r0+r6*2] + add r2, r6 + add r4, r6 +%ifdef ARCH_X86_64 + DECLARE_REG_TMP 10,11 +%else + DECLARE_REG_TMP 1,3 +%endif + mov t1d, r1d ; read i_dst for the pad limit before t0 is loaded: on x86_32 t0 aliases r1 + mov t0d, r7m + shr t1d, 1 + sub t1d, r6d +.loopy: + mov r6d, r6m + neg r6 +.prefetch: + prefetchnta [r2+r6] + prefetchnta [r4+r6] + add r6, 64 + jl .prefetch + mov r6d, r6m + neg r6 +.loopx: + INTERLEAVE r0+r6*2, r2+r6, r4+r6, 0, nt + INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt + add r6, 16 + jl .loopx +.pad: +%if mmsize==8 + movntq [r0+r6*2], m0 + movntq [r0+r6*2+8], m0 + movntq [r0+r6*2+16], m0 + movntq [r0+r6*2+24], m0 +%else + movntdq [r0+r6*2], m0 + movntdq [r0+r6*2+16], m0 +%endif + add r6, 16 + cmp r6, t1 + jl .pad + add r0, r1mp + add r2, r3mp + add r4, r5 + dec t0d + jg .loopy + sfence + emms + RET + +;----------------------------------------------------------------------------- +; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ) +;----------------------------------------------------------------------------- +cglobal store_interleave_8x8x2_%1, 4,5 + mov r4d, 4 +.loop: + INTERLEAVE r0, r2, r3, 1 + INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1 + add r2, FDEC_STRIDE*2 + add r3, FDEC_STRIDE*2 + lea r0, [r0+r1*2] + dec r4d + 
jg .loop + REP_RET +%endmacro ; PLANE_INTERLEAVE + +%macro DEINTERLEAVE_START 1 +%ifidn %1, ssse3 + mova m4, [deinterleave_shuf] +%else + mova m4, [pw_00ff] +%endif +%endmacro + +%macro PLANE_DEINTERLEAVE 1 +;----------------------------------------------------------------------------- +; void plane_copy_deinterleave( uint8_t *dstu, int i_dstu, +; uint8_t *dstv, int i_dstv, +; uint8_t *src, int i_src, int w, int h ) +;----------------------------------------------------------------------------- +cglobal plane_copy_deinterleave_%1, 6,7 + DEINTERLEAVE_START %1 + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + add r0, r6 + add r2, r6 + lea r4, [r4+r6*2] +.loopy: + mov r6d, r6m + neg r6 +.loopx: + DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, %1, m4 + DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m4 + add r6, 16 + jl .loopx + add r0, r1 + add r2, r3 + add r4, r5 + dec dword r7m + jg .loopy + REP_RET + +;----------------------------------------------------------------------------- +; void load_deinterleave_8x8x2_fenc( uint8_t *dst, uint8_t *src, int i_src ) +;----------------------------------------------------------------------------- +cglobal load_deinterleave_8x8x2_fenc_%1, 3,4 + DEINTERLEAVE_START %1 + mov r3d, 4 +.loop: + DEINTERLEAVE r0, r0+FENC_STRIDE/2, r1, 1, %1, m4 + DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE*3/2, r1+r2, 1, %1, m4 + add r0, FENC_STRIDE*2 + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void load_deinterleave_8x8x2_fdec( uint8_t *dst, uint8_t *src, int i_src ) +;----------------------------------------------------------------------------- +cglobal load_deinterleave_8x8x2_fdec_%1, 3,4 + DEINTERLEAVE_START %1 + mov r3d, 4 +.loop: + DEINTERLEAVE r0, r0+FDEC_STRIDE/2, r1, 0, %1, m4 + DEINTERLEAVE r0+FDEC_STRIDE, r0+FDEC_STRIDE*3/2, r1+r2, 0, %1, m4 + add r0, FDEC_STRIDE*2 + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET 
+%endmacro ; PLANE_DEINTERLEAVE + +INIT_MMX +PLANE_INTERLEAVE mmxext +PLANE_DEINTERLEAVE mmx +INIT_XMM +PLANE_INTERLEAVE sse2 +PLANE_DEINTERLEAVE sse2 +PLANE_DEINTERLEAVE ssse3 + ; These functions are not general-use; not only do the SSE ones require aligned input, ; but they also will fail if given a non-mod16 size or a size less than 64. diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index fb73562..502319c 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -76,20 +76,34 @@ void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int ); void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); void x264_prefetch_ref_mmxext( uint8_t *, int, int ); -void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); -void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h); +void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h ); +void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +void 
x264_plane_copy_deinterleave_mmx( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +void x264_load_deinterleave_8x8x2_fenc_mmx( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fenc_sse2( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fdec_mmx( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fdec_sse2( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src ); void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); void x264_memzero_aligned_mmx( void * dst, int n ); @@ -103,6 +117,17 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, int len ); + +#define MC_CHROMA(cpu) \ +void x264_mc_chroma_##cpu( uint8_t *dstu, uint8_t *dstv, int i_dst,\ + uint8_t *src, int i_src,\ + int dx, int dy, int i_width, int i_height ); +MC_CHROMA(mmxext) +MC_CHROMA(sse2) +MC_CHROMA(sse2_misalign) +MC_CHROMA(ssse3) +MC_CHROMA(ssse3_cache64) + #define LOWRES(cpu) \ void x264_frame_init_lowres_core_##cpu( uint8_t 
*src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\ int src_stride, int dst_stride, int width, int height ); @@ -342,11 +367,13 @@ HPEL(16, ssse3, ssse3, ssse3, ssse3) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) -static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h) +static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h ) { if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if(i_src > 0) { + } else if( !(w&15) ) { + x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h ); + } else if( i_src > 0 ) { // have to use plain memcpy on the last line (in memory order) to avoid overreading src x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 ); memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w ); @@ -356,6 +383,27 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i } } +#define PLANE_INTERLEAVE(cpu) \ +static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\ + uint8_t *srcu, int i_srcu,\ + uint8_t *srcv, int i_srcv, int w, int h )\ +{\ + if( !(w&15) ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( i_srcu > 0 ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ + x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ + } else {\ + x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ + }\ +} + +PLANE_INTERLEAVE(mmxext) +PLANE_INTERLEAVE(sse2) + void x264_mc_init_mmx( int cpu, 
x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) @@ -391,7 +439,14 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext; + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext; + pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx; + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx; + pf->plane_copy = x264_plane_copy_mmxext; + pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx; + pf->hpel_filter = x264_hpel_filter_mmxext; pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext; @@ -440,10 +495,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_SSE_MISALIGN ) pf->hpel_filter = x264_hpel_filter_sse2_misalign; pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - pf->mc_chroma = x264_mc_chroma_sse2; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium? 
+ pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2; + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2; + pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->mc_luma = mc_luma_sse2; pf->get_ref = get_ref_sse2; if( cpu&X264_CPU_CACHELINE_64 ) @@ -452,7 +513,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->get_ref = get_ref_cache64_sse2; } if( cpu&X264_CPU_SSE_MISALIGN ) + { pf->get_ref = get_ref_sse2_misalign; + pf->mc_chroma = x264_mc_chroma_sse2_misalign; + } } if( !(cpu&X264_CPU_SSSE3) ) @@ -467,12 +531,19 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3; + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; - pf->mc_chroma = x264_mc_chroma_ssse3; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_ssse3; + if( cpu&X264_CPU_CACHELINE_64 ) { - pf->mc_chroma = x264_mc_chroma_ssse3_cache64; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_ssse3_cache64; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 78ca4c7..1788186 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -40,6 +40,7 @@ hmul_8p: times 8 db 1 times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 SECTION .text @@ -304,6 +305,55 @@ SSD 4, 4, ssse3 SSD 4, 8, ssse3 %assign function_align 16 
+;----------------------------------------------------------------------------- +; uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height ) +;----------------------------------------------------------------------------- +%macro SSD_NV12 1-2 0 +cglobal pixel_ssd_nv12_core_%1, 6,7 + shl r4d, 1 + add r0, r4 + add r2, r4 + pxor m3, m3 + pxor m4, m4 + mova m5, [pw_00ff] +.loopy: + mov r6, r4 + neg r6 +.loopx: + mova m0, [r0+r6] + mova m1, [r2+r6] + psubusb m0, m1 + psubusb m1, [r0+r6] + por m0, m1 + mova m2, m0 + pand m0, m5 + psrlw m2, 8 + pmaddwd m0, m0 + pmaddwd m2, m2 + paddd m3, m0 + paddd m4, m2 + add r6, mmsize + jl .loopx + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + HADDD m3, m0 + HADDD m4, m0 + movd eax, m3 + movd edx, m4 +%ifdef ARCH_X86_64 + shl rdx, 32 + add rax, rdx +%endif + RET +%endmacro ; SSD_NV12 + +INIT_MMX +SSD_NV12 mmxext +INIT_XMM +SSD_NV12 sse2 + ;============================================================================= ; variance ;============================================================================= @@ -2168,9 +2218,7 @@ cglobal pixel_ssim_end4_sse2, 3,3,7 add t0, 4*%1 sub r0d, 4*%1 jg .loop -%ifdef WIN64 - RESTORE_XMM r10 -%endif + WIN64_RESTORE_XMM r10 jmp ads_mvs %endmacro diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 9bba683..0339564 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -97,6 +97,10 @@ void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * ); +uint64_t x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); +uint64_t x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int 
stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 9d23640..5d486bc 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -271,13 +271,21 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] ASSERT %2 >= %1 %assign regs_used %2 ASSERT regs_used <= 7 - %assign xmm_regs_used %3 - ASSERT xmm_regs_used <= 16 %if regs_used > 4 push r4 push r5 %assign stack_offset stack_offset+16 %endif + WIN64_SPILL_XMM %3 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 sub rsp, (xmm_regs_used-6)*16+16 %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 @@ -287,13 +295,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i %endrep %endif - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 - DEFINE_ARGS %4 %endmacro -%macro RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 1 %if xmm_regs_used > 6 %assign %%i xmm_regs_used %rep (xmm_regs_used-6) @@ -304,14 +308,14 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %endif %endmacro -%macro RESTORE_XMM 1 - RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 %assign xmm_regs_used 0 %endmacro %macro RET 0 - RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL rsp %if regs_used > 4 pop r5 pop r4 @@ -428,6 +432,13 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endif ;====================================================================== +%ifndef WIN64 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + ;============================================================================= @@ -494,7 +505,7 @@ SECTION 
.note.GNU-stack noalloc noexec nowrite progbits %define mova movq %define movu movq %define movh movd - %define movnt movntq + %define movnta movntq %assign %%i 0 %rep 8 CAT_XDEFINE m, %%i, mm %+ %%i @@ -518,7 +529,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define mova movdqa %define movu movdqu %define movh movq - %define movnt movntdq + %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, xmm %+ %%i diff --git a/encoder/analyse.c b/encoder/analyse.c index 8868012..92edfcd 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1091,8 +1091,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \ - (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ - (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = weight_none; \ (m)->i_ref = ref; @@ -1473,11 +1472,11 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, const int mvy_offset = h->mb.b_interlaced & i_ref ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; + // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ - h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[1].weightfn ) \ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ - h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[2].weightfn ) \ weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); diff --git a/encoder/encoder.c b/encoder/encoder.c index 7ad4295..a73322d 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -65,9 +65,20 @@ static void x264_frame_dump( x264_t *h ) return; /* Write the frame in display order */ fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET ); - for( int i = 0; i < h->fdec->i_plane; i++ ) - for( int y = 0; y < h->param.i_height >> !!i; y++ ) - fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f ); + for( int y = 0; y < h->param.i_height; y++ ) + fwrite( &h->fdec->plane[0][y*h->fdec->i_stride[0]], 1, h->param.i_width, f ); + int cw = h->param.i_width>>1; + int ch = h->param.i_height>>1; + /* The asm deinterleave stores 16 bytes per plane per step and may run past the end of a row, so pad the buffer and keep U and V 16 bytes apart. On allocation failure only luma is dumped. */ + uint8_t *planeu = x264_malloc( cw*ch*2+32 ); + if( planeu ) + { + uint8_t *planev = planeu + cw*ch + 16; + h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); + fwrite( planeu, 1, cw*ch, f ); + fwrite( planev, 1, cw*ch, f ); + x264_free( planeu ); + } fclose( f ); } @@ -382,9 +393,9 @@ static int x264_validate_parameters( x264_t *h ) return -1; } int i_csp = h->param.i_csp & X264_CSP_MASK; - if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) + if( i_csp 
!= X264_CSP_I420 && i_csp != X264_CSP_YV12 && i_csp != X264_CSP_NV12 ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12 supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12 supported)\n" ); return -1; } @@ -1561,15 +1567,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) if( min_y < h->i_threadslice_start ) return; - if( !b_end && b_inloop ) - for( int j = 0; j <= h->sh.b_mbaff; j++ ) - for( int i = 0; i < 3; i++ ) - { - memcpy( h->intra_border_backup[j][i], - h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i], - h->sps->i_mb_width*16 >> !!i ); - } - if( b_deblock ) for( int y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) ) x264_frame_deblock_row( h, y ); @@ -1594,12 +1591,19 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) if( b_measure_quality ) { if( h->param.analyse.b_psnr ) - for( int i = 0; i < 3; i++ ) - h->stat.frame.i_ssd[i] += - x264_pixel_ssd_wxh( &h->pixf, - h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i], - h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i], - h->param.i_width >> !!i, (max_y-min_y) >> !!i ); + { + uint64_t ssd_y = x264_pixel_ssd_wxh( &h->pixf, + h->fdec->plane[0] + min_y * h->fdec->i_stride[0], h->fdec->i_stride[0], + h->fenc->plane[0] + min_y * h->fenc->i_stride[0], h->fenc->i_stride[0], + h->param.i_width, max_y-min_y ); + uint64_t ssd_uv = x264_pixel_ssd_nv12( &h->pixf, + h->fdec->plane[1] + (min_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1], + h->fenc->plane[1] + (min_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1], + h->param.i_width>>1, (max_y-min_y)>>1 ); + h->stat.frame.i_ssd[0] += ssd_y; + h->stat.frame.i_ssd[1] += (uint32_t)ssd_uv; + h->stat.frame.i_ssd[2] += ssd_uv>>32; + } if( h->param.analyse.b_ssim ) { @@ -2572,12 +2576,10 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, if( pic_out->i_pts < pic_out->i_dts ) 
x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" ); + pic_out->img.i_csp = X264_CSP_NV12; pic_out->img.i_plane = h->fdec->i_plane; - for( int i = 0; i < 3; i++ ) - { - pic_out->img.i_stride[i] = h->fdec->i_stride[i]; - pic_out->img.plane[i] = h->fdec->plane[i]; - } + memcpy( pic_out->img.i_stride, h->fdec->i_stride, sizeof(h->fdec->i_stride) ); + memcpy( pic_out->img.plane, h->fdec->plane, sizeof(h->fdec->plane) ); x264_frame_push_unused( thread_current, h->fenc ); diff --git a/encoder/macroblock.c b/encoder/macroblock.c index a961baf..a55e09c 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -485,25 +485,16 @@ static void x264_macroblock_encode_pskip( x264_t *h ) /* Special case for mv0, which is (of course) very common in P-skip mode. */ if( mvx | mvy ) - { - h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE, + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], mvx, mvy, 8, 8 ); - h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], - mvx, mvy, 8, 8 ); - } else - { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], 8 ); - } + h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] ); if( h->sh.weight[0][1].weightfn ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &h->sh.weight[0][1], 8 ); - if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, @@ -517,25 +508,21 @@ static void x264_macroblock_encode_pskip( x264_t *h ) * Intra prediction for predictive lossless mode. 
*****************************************************************************/ -/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes - * that the edge pixels of the reconstructed frame are the same as that of the source frame. This means - * they will only work correctly if the neighboring blocks are losslessly coded. In practice, this means - * lossless mode cannot be mixed with lossy mode within a frame. */ -/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't - * need to be done unless we decide to allow mixing lossless and lossy compression. */ - void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode ) { - int stride = h->fenc->i_stride[1] << h->mb.b_interlaced; if( i_mode == I_PRED_CHROMA_V ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 ); + CP64( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE ); + CP64( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE ); } else if( i_mode == I_PRED_CHROMA_H ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 ); + x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 ); + x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, 
h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 ); } else { @@ -947,28 +934,26 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) i_qp = h->mb.i_chroma_qp; thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + if( !b_bidir ) + { + /* Special case for mv0, which is (of course) very common in P-skip mode. */ + if( M32( mvp ) ) + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, + h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], + mvp[0], mvp[1], 8, 8 ); + else + h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] ); + } + for( int ch = 0; ch < 2; ch++ ) { uint8_t *p_src = h->mb.pic.p_fenc[1+ch]; uint8_t *p_dst = h->mb.pic.p_fdec[1+ch]; - if( !b_bidir ) - { - /* Special case for mv0, which is (of course) very common in P-skip mode. */ - if( M32( mvp ) ) - { - h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], - mvp[0], mvp[1], 8, 8 ); - } - else - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], 8 ); - - if( h->sh.weight[0][1+ch].weightfn ) - h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - &h->sh.weight[0][1+ch], 8 ); - } + if( !b_bidir && h->sh.weight[0][1+ch].weightfn ) + h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + &h->sh.weight[0][1+ch], 8 ); /* there is almost never a termination during chroma, but we can't avoid the check entirely */ /* so instead we check SSD and skip the actual check if the score is low enough. 
*/ diff --git a/encoder/me.c b/encoder/me.c index d7b2928..3e0f5f6 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -789,18 +789,17 @@ if( b_refine_qpel || (dir^1) != odir ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ if( b_chroma_me && cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ + h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \ &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \ &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \ @@ -923,10 +922,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite stride[list][i] = bw;\ src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \ if( rd )\ - {\ - h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], 
m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ } #define SATD_THRESH 17/16 @@ -1108,10 +1104,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei uint64_t cost; \ M32( cache_mv ) = pack16to32_mask(mx,my); \ if( m->i_pixel <= PIXEL_8x8 )\ - {\ - h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 7f18c7f..7bd411d 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -203,20 +203,30 @@ static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale ) + rce->misc_bits; } +static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_sqr, int shift ) +{ + uint32_t sum = (uint32_t)sum_sqr; + uint32_t sqr = sum_sqr >> 32; + return sqr - (sum * sum >> shift); +} + static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i ) { int w = i ? 8 : 16; - int shift = i ? 6 : 8; int stride = frame->i_stride[i]; int offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride - : w * (mb_x + mb_y * stride); - int pix = i ? PIXEL_8x8 : PIXEL_16x16; + ? 
16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride + : 16 * mb_x + w * mb_y * stride; stride <<= h->mb.b_interlaced; - uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride ); - uint32_t sum = (uint32_t)res; - uint32_t sqr = res >> 32; - return sqr - (sum * sum >> shift); + if( i ) + { + ALIGNED_ARRAY_16( uint8_t, pix,[FENC_STRIDE*8] ); + h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride ); + return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6 ) + + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6 ); + } + else + return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8 ); } // Find the total AC energy of the block in all planes. @@ -228,7 +238,6 @@ static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame * sure no reordering goes on. */ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 ); var += ac_energy_plane( h, mb_x, mb_y, frame, 1 ); - var += ac_energy_plane( h, mb_x, mb_y, frame, 2 ); x264_emms(); return var; } diff --git a/tools/checkasm.c b/tools/checkasm.c index 228b75f..7bbae4d 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -445,6 +445,25 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 ); report( "intra sad_x3 :" ); + ok = 1; used_asm = 0; + if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core ) + { + used_asm = 1; + set_func_name( "ssd_nv12" ); + uint64_t res_c = pixel_c.ssd_nv12_core( buf1, 368, buf2, 368, 360, 8 ); + uint64_t res_a = pixel_asm.ssd_nv12_core( buf1, 368, buf2, 368, 360, 8 ); + if( res_c != res_a ) + { + ok = 0; + fprintf( stderr, "ssd_nv12: %u,%u != %u,%u\n", + (uint32_t)res_c, (uint32_t)(res_c>>32), + (uint32_t)res_a, (uint32_t)(res_a>>32) ); + } + call_c( pixel_c.ssd_nv12_core, buf1, 368, buf2, 368, 360, 8 ); + call_a( pixel_asm.ssd_nv12_core, buf1, 368, buf2, 368, 360, 8 ); + } + report( "ssd_nv12 :" ); + if( 
pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) { @@ -823,12 +842,15 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ memset( buf3, 0xCD, 1024 ); \ memset( buf4, 0xCD, 1024 ); \ - call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \ - call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \ + call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \ + call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\ for( int j = 0; j < h; j++ ) \ - for( int i = w; i < 4; i++ ) \ + for( int i = w; i < 8; i++ ) \ + { \ + dst2[i+j*16+8] = dst1[i+j*16+8]; \ dst2[i+j*16] = dst1[i+j*16]; \ + } \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ @@ -970,6 +992,121 @@ static int check_mc( int cpu_ref, int cpu_new ) } report( "mc offsetsub :" ); + ok = 1; used_asm = 0; + if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 ) + { + set_func_name( "store_interleave_8x8x2" ); + used_asm = 1; + memset( buf3, 0, 64*8 ); + memset( buf4, 0, 64*8 ); + call_c( mc_c.store_interleave_8x8x2, buf3, 64, buf1, buf1+16 ); + call_a( mc_a.store_interleave_8x8x2, buf4, 64, buf1, buf1+16 ); + if( memcmp( buf3, buf4, 64*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc ) + { + set_func_name( "load_deinterleave_8x8x2_fenc" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_8x8x2_fenc, buf3, buf1, 64 ); + call_a( mc_a.load_deinterleave_8x8x2_fenc, buf4, buf1, 64 ); + if( memcmp( buf3, buf4, FENC_STRIDE*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec ) + { + set_func_name( "load_deinterleave_8x8x2_fdec" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_8x8x2_fdec, buf3, buf1, 64 ); + call_a( mc_a.load_deinterleave_8x8x2_fdec, buf4, buf1, 64 ); + 
if( memcmp( buf3, buf4, FDEC_STRIDE*8 ) ) + ok = 0; + } + report( "store_interleave :" ); + + struct plane_spec { + int w, h, src_stride; + } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} }; + ok = 1; used_asm = 0; + if( mc_a.plane_copy != mc_ref.plane_copy ) + { + set_func_name( "plane_copy" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = plane_specs[i].w; + int h = plane_specs[i].h; + int src_stride = plane_specs[i].src_stride; + int dst_stride = (w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy, buf3, dst_stride, src, src_stride, w, h ); + call_a( mc_a.plane_copy, buf4, dst_stride, src, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) + { + set_func_name( "plane_copy_interleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int src_stride = (plane_specs[i].src_stride + 1) >> 1; + int dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_interleave, buf3, dst_stride, src, src_stride, src+1024, src_stride+16, w, h ); + call_a( mc_a.plane_copy_interleave, buf4, dst_stride, src, src_stride, src+1024, src_stride+16, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, 2*w ) ) + { + ok = 0; + fprintf( stderr, 
"plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + + if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave ) + { + set_func_name( "plane_copy_deinterleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int dst_stride = w; + int src_stride = (2*w + 127) & ~63; + int offv = (dst_stride*h + 31) & ~15; + memset( buf3, 0, 0x1000 ); + memset( buf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_deinterleave, buf3, dst_stride, buf3+offv, dst_stride, buf1, src_stride, w, h ); + call_a( mc_a.plane_copy_deinterleave, buf4, dst_stride, buf4+offv, dst_stride, buf1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( buf3+y*dst_stride, buf4+y*dst_stride, w ) || + memcmp( buf3+y*dst_stride+offv, buf4+y*dst_stride+offv, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + report( "plane_copy :" ); + if( mc_a.hpel_filter != mc_ref.hpel_filter ) { uint8_t *srchpel = buf1+8+2*64; diff --git a/x264.h b/x264.h index 83f087e..499da19 100644 --- a/x264.h +++ b/x264.h @@ -35,7 +35,7 @@ #include -#define X264_BUILD 94 +#define X264_BUILD 95 /* x264_t: * opaque handler for encoder */ @@ -126,7 +126,8 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; #define X264_CSP_RGB 0x0006 /* rgb 24bits */ #define X264_CSP_BGR 0x0007 /* bgr 24bits */ #define X264_CSP_BGRA 0x0008 /* bgr 32bits */ -#define X264_CSP_MAX 0x0009 /* end of list */ +#define X264_CSP_NV12 0x0009 /* yuv 4:2:0, with one y plane and one packed u+v */ +#define X264_CSP_MAX 0x0010 /* end of list */ #define X264_CSP_VFLIP 0x1000 /* */ /* Slice type */