From 4e3d590d159ef6cea2756cb760e2d3ea1e881612 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Thu, 6 May 2010 05:36:40 +0000 Subject: [PATCH] Convert internal pixel format from YV12 to NV12 ~1% faster on Conroe. Mostly due to improved cache locality, but also a little bit of increased simd width in mc_chroma and deblock_chroma. (insert more benchmarks here) Will be slower on arm and ppc until their asm is updated. --- common/arm/mc-c.c | 2 +- common/common.h | 8 +- common/deblock.c | 70 +++--- common/frame.c | 137 ++++++---- common/frame.h | 8 +- common/macroblock.c | 154 +++++++---- common/macroblock.h | 2 + common/mc.c | 67 ++++- common/mc.h | 14 +- common/pixel.c | 32 ++- common/pixel.h | 5 +- common/ppc/mc.c | 2 +- common/x86/deblock-a.asm | 354 +++++++++++++++--------- common/x86/mc-a.asm | 689 +++++++++++++++++++++++++++------------------- common/x86/mc-a2.asm | 215 ++++++++++++++- common/x86/mc-c.c | 107 ++++++-- common/x86/pixel-a.asm | 54 ++++- common/x86/pixel.h | 4 + common/x86/x86inc.asm | 35 ++- encoder/analyse.c | 9 +- encoder/encoder.c | 51 ++-- encoder/macroblock.c | 65 ++--- encoder/me.c | 21 +- encoder/ratecontrol.c | 31 ++- tools/checkasm.c | 143 ++++++++++- x264.h | 3 +- 26 files changed, 1570 insertions(+), 712 deletions(-) diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index b1106dd..7467150 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -234,7 +234,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) pf->offsetsub = x264_mc_offsetsub_wtab_neon; pf->weight_cache = x264_weight_cache_neon; - pf->mc_chroma = x264_mc_chroma_neon; +// pf->mc_chroma = x264_mc_chroma_neon; pf->mc_luma = mc_luma_neon; pf->get_ref = get_ref_neon; pf->hpel_filter = hpel_filter_neon; diff --git a/common/common.h b/common/common.h index ca27968..6af92ea 100644 --- a/common/common.h +++ b/common/common.h @@ -668,16 +668,16 @@ struct x264_t ALIGNED_16( uint32_t fenc_satd_cache[32] ); /* pointer over mb of the frame to be compressed */ - pixel *p_fenc[3]; + pixel *p_fenc[3]; /* y,u,v */ /* pointer to the actual source frame, not a block copy */ - pixel *p_fenc_plane[3]; + pixel *p_fenc_plane[2]; /* y,uv */ /* pointer over mb of the frame to be reconstructed */ pixel *p_fdec[3]; /* pointer over mb of the references */ int i_fref[2]; - pixel *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */ + pixel *p_fref[2][32][4+1]; /* last: yN, yH, yV, yHV, uv */ pixel *p_fref_w[32]; /* weighted fullpel luma */ uint16_t *p_integral[2][16]; @@ -813,7 +813,7 @@ struct x264_t /* Buffers that are allocated per-thread even in sliced threads. 
*/ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */ - pixel *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ + pixel *intra_border_backup[2][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ uint8_t (*deblock_strength[2])[2][4][4]; /* CPU functions dependents */ diff --git a/common/deblock.c b/common/deblock.c index a439d8d..55039b5 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -134,7 +134,8 @@ static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int a pix += 2*ystride; continue; } - for( int d = 0; d < 2; d++ ) + for( int d = 0; d < 2; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -147,17 +148,16 @@ static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int a pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } - pix += ystride; } } } static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 ); + deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); } static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta ) @@ -212,9 +212,10 @@ static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir ) { - for( int d = 0; d < 8; d++ ) + for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) + for( int e = 0; e < (dir?1:2); e++, pix++ ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -226,16 +227,15 @@ static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } - pix += ystride; } } static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, stride, 1, alpha, beta ); + deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 ); } static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) { - deblock_chroma_intra_c( pix, 1, stride, alpha, beta ); + deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 ); } static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], @@ -267,7 +267,7 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264 } } -static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) +static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; @@ -283,12 +283,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, 
pixel *pix2, int i_stri tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma; tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma; - pf_inter( pix1, i_stride, alpha, beta, tc ); - if( b_chroma ) - pf_inter( pix2, i_stride, alpha, beta, tc ); + pf_inter( pix, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) { int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; @@ -298,9 +296,7 @@ static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int if( !alpha || !beta ) return; - pf_intra( pix1, i_stride, alpha, beta ); - if( b_chroma ) - pf_intra( pix2, i_stride, alpha, beta ); + pf_intra( pix, i_stride, alpha, beta ); } void x264_frame_deblock_row( x264_t *h, int mb_y ) @@ -323,13 +319,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&b_interlaced][mb_x]; pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; - pixel *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x; - pixel *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x; + pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x; if( mb_y & b_interlaced ) { pixy -= 15*stridey; - pixu -= 7*strideuv; - pixv -= 7*strideuv; + pixuv -= 7*strideuv; } int qp = h->mb.qp[mb_xy]; @@ -339,11 +333,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) #define FILTER( intra, dir, edge, qp, chroma_qp )\ do\ {\ - deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1), NULL,\ + deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\ stride2y, bs[dir][edge], qp, 0,\ h->loopf.deblock_luma##intra[dir] );\ if( !(edge & 1) )\ - deblock_edge##intra( h, pixu + 2*edge*(dir?stride2uv:1), pixv + 2*edge*(dir?stride2uv:1),\ + deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\ stride2uv, bs[dir][edge], chroma_qp, 1,\ h->loopf.deblock_chroma##intra[dir] );\ } while(0) @@ -393,15 +387,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) } #if HAVE_MMX -void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); - void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4], int 
mvy_limit, int bframe ); @@ -414,9 +407,14 @@ void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X #if ARCH_X86 void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ); +// FIXME this wrapper has a significant cpu cost static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 ); @@ -458,15 +456,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) if( cpu&X264_CPU_MMXEXT ) { #if !X264_HIGH_BIT_DEPTH - pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext; - pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext; - pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext; #if ARCH_X86 pf->deblock_luma[1] = x264_deblock_v_luma_mmxext; pf->deblock_luma[0] = x264_deblock_h_luma_mmxext; + pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext; + pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext; + pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext; #endif #endif // !X264_HIGH_BIT_DEPTH pf->deblock_strength = x264_deblock_strength_mmxext; @@ -478,8 +476,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) { pf->deblock_luma[1] = x264_deblock_v_luma_sse2; pf->deblock_luma[0] = x264_deblock_h_luma_sse2; + pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2; + pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; + pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2; } #endif // !X264_HIGH_BIT_DEPTH } @@ -502,8 +504,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; - pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; - pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; +// pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; +// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; } #endif #endif // !X264_HIGH_BIT_DEPTH diff --git a/common/frame.c b/common/frame.c index d862468..3fd1801 100644 --- a/common/frame.c +++ b/common/frame.c @@ -42,20 +42,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) i_stride = ALIGN( i_width + 2*PADH, align ); i_lines = h->mb.i_mb_height*16; - frame->i_plane = 3; - for( int i = 0; i < 3; i++ ) + frame->i_plane = 2; + for( int i = 0; i < 2; i++ ) { - frame->i_stride[i] = ALIGN( i_stride >> !!i, align ); - frame->i_width[i] = i_width >> !!i; - 
frame->i_lines[i] = i_lines >> !!i; - } - - luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); - chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv)); - for( int i = 1; i < 3; i++ ) - { - CHECKED_MALLOC( frame->buffer[i], chroma_plane_size * sizeof(pixel) ); - frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; + frame->i_stride[i] = ALIGN( i_stride, align ); + frame->i_width[i] = i_width >> i; + frame->i_lines[i] = i_lines >> i; } for( int i = 0; i < h->param.i_bframe + 2; i++ ) @@ -81,6 +73,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) frame->orig = frame; + luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv)); + chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv)); + + CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) ); + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH; + /* all 4 luma planes allocated together, since the cacheline split code * requires them to be in-phase wrt cacheline alignment. */ if( h->param.analyse.i_subpel_refine && b_fdec ) @@ -214,10 +212,31 @@ void x264_frame_delete( x264_frame_t *frame ) x264_free( frame ); } +static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift ) +{ + int width = h->param.i_width >> xshift; + int height = h->param.i_height >> yshift; + *pix = src->img.plane[plane]; + *stride = src->img.i_stride[plane]; + if( src->img.i_csp & X264_CSP_VFLIP ) + { + *pix += (height-1) * *stride; + *stride = -*stride; + } + if( width > abs(*stride) ) + { + x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride ); + return -1; + } + return 0; +} + +#define get_plane_ptr(...) do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0) + int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) { int i_csp = src->img.i_csp & X264_CSP_MASK; - if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) + if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 && i_csp != X264_CSP_NV12 ) { x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" ); return -1; @@ -229,43 +248,53 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) dst->param = src->param; dst->i_pic_struct = src->i_pic_struct; - for( int i = 0; i < 3; i++ ) + uint8_t *pix[3]; + int stride[3]; + get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 ); + h->mc.plane_copy( dst->plane[0], dst->i_stride[0], pix[0], stride[0], + h->param.i_width, h->param.i_height ); + if( i_csp == X264_CSP_NV12 ) { - int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i; - uint8_t *plane = src->img.plane[s]; - int stride = src->img.i_stride[s]; - int width = h->param.i_width >> !!i; - int height = h->param.i_height >> !!i; - if( src->img.i_csp & X264_CSP_VFLIP ) - { - plane += (height-1)*stride; - stride = -stride; - } - if( width > abs(stride) ) - { - x264_log( h, X264_LOG_ERROR, "Input picture width is greater than stride\n" ); - return -1; - } - h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height ); + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 ); + h->mc.plane_copy( dst->plane[1], dst->i_stride[1], pix[1], stride[1], + h->param.i_width, h->param.i_height>>1 ); + } + else + { + get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 ); + get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 
2 : 1, 1, 1 ); + h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], + pix[1], stride[1], pix[2], stride[2], + h->param.i_width>>1, h->param.i_height>>1 ); } return 0; } -static void ALWAYS_INLINE pixel_memset( pixel *dst, int value, int size ) +static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size ) { - for( int i = 0; i < size; i++ ) - dst[i] = value; + uint8_t *dstp = (uint8_t*)dst; + if(size == 1) { + memset(dst, *src, len); + } else if(size == 2) { + int v = M16( src ); + for(int i=0; i>b_chroma, sizeof(pixel)<>b_chroma, sizeof(pixel)<i_plane; i++ ) { int stride = frame->i_stride[i]; - int width = 16*h->mb.i_mb_width >> !!i; + int width = 16*h->sps->i_mb_width; int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i; - int padh = PADH >> !!i; + int padh = PADH; int padv = PADV >> !!i; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i); @@ -296,12 +325,12 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e height += 4 >> (!!i + h->sh.b_mbaff); if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i ); } else { - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i ); } } } @@ -323,37 +352,35 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4; if( h->sh.b_mbaff ) { - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 ); } else - plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end ); + plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 ); } } void x264_frame_expand_border_lowres( x264_frame_t *frame ) { for( int i = 0; i < 4; i++ ) - plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 ); + plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 ); } void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame ) { for( int i = 0; i < frame->i_plane; i++ ) { - int i_subsample = i ? 
1 : 0; - int i_width = h->param.i_width >> i_subsample; - int i_height = h->param.i_height >> i_subsample; - int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width) >> i_subsample; - int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> i_subsample; + int i_width = h->param.i_width; + int i_height = h->param.i_height >> !!i; + int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width); + int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> !!i; if( i_padx ) { for( int y = 0; y < i_height; y++ ) - { - pixel value = frame->plane[i][y*frame->i_stride[i] + i_width - 1]; - pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], value, i_padx ); - } + pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], + &frame->plane[i][y*frame->i_stride[i] + i_width - 1-i], + i_padx>>i, sizeof(pixel)<mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); @@ -48,11 +49,6 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->sh.weight[i_ref][1], height*2 ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); - if( h->sh.weight[i_ref][2].weightfn ) h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, @@ -73,13 +69,10 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h if( h->mb.b_interlaced & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], + &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], mvx, mvy, 2*width, 2*height ); - - h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2], - mvx, mvy, 2*width, 2*height ); } static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height ) @@ -110,16 +103,12 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int if( h->mb.b_interlaced & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], + h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], mvx1, mvy1, 2*width, 2*height ); h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); - h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2], - mvx0, mvy0, 2*width, 2*height ); - h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2], - mvx1, mvy1, 2*width, 2*height ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, 
weight ); + h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } void x264_mb_mc_8x8( x264_t *h, int i8 ) @@ -324,11 +313,11 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) if( !b_lookahead ) for( int i = 0; i <= h->param.b_interlaced; i++ ) { - for( int j = 0; j < 3; j++ ) + for( int j = 0; j < 2; j++ ) { /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */ - CHECKED_MALLOCZERO( h->intra_border_backup[i][j], ((h->mb.i_mb_width*16+32)>>!!j) * sizeof(pixel) ); - h->intra_border_backup[i][j] += 8; + CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) ); + h->intra_border_backup[i][j] += 16; } CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width ); } @@ -359,8 +348,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead ) for( int i = 0; i <= h->param.b_interlaced; i++ ) { x264_free( h->deblock_strength[i] ); - for( int j = 0; j < 3; j++ ) - x264_free( h->intra_border_backup[i][j] - 8 ); + for( int j = 0; j < 2; j++ ) + x264_free( h->intra_border_backup[i][j] - 16 ); } x264_free( h->scratch_buffer ); } @@ -480,13 +469,13 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) { int stride_y = fenc->i_stride[0]; int stride_uv = fenc->i_stride[1]; - int off_y = 16 * (i_mb_x + i_mb_y * stride_y); - int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv); + int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y; + int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv; h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, - fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x ); + fenc->plane[1]+off_uv, stride_uv, i_mb_x ); } -static NOINLINE void copy_column8( pixel *dst, pixel *src ) +NOINLINE void x264_copy_column8( pixel *dst, pixel *src ) { // input pointers are offset by 4 rows because that's faster (smaller instruction size on x86) for( int i = -4; i < 4; i++ ) @@ -495,30 +484,44 @@ static NOINLINE void copy_column8( pixel *dst, pixel *src ) static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced ) { - const int w = (i == 0 ? 16 : 8); - const int i_stride = h->fdec->i_stride[!!i]; - const int i_stride2 = i_stride << b_interlaced; - const int i_pix_offset = b_interlaced - ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride - : w * (mb_x + mb_y * i_stride); - const pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset]; - const pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i]; + int w = (i ? 8 : 16); + int i_stride = h->fdec->i_stride[i]; + int i_stride2 = i_stride << b_interlaced; + int i_pix_offset = b_interlaced + ? 
16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + w * mb_y * i_stride; + pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset]; + pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16]; int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; x264_frame_t **fref[2] = { h->fref0, h->fref1 }; if( b_interlaced ) ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride; h->mb.pic.i_stride[i] = i_stride2; h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, - h->mb.pic.p_fenc_plane[i], i_stride2, w ); - memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, (w*3/2+1) * sizeof(pixel) ); + if( i ) + { + h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 ); + memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) ); + memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) ); + } + else + { + h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 ); + memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) ); + } if( b_interlaced ) for( int j = 0; j < w; j++ ) - h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + if( i ) + { + h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; + h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + } + else + h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; for( int j = 0; j < h->mb.pic.i_fref[0]; j++ ) { - h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) + h->mb.pic.p_fref[0][j][i?4:0] = &fref[0][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( !i ) { for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[0][j][k] = &fref[0][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]]; @@ -531,8 +534,8 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x if( h->sh.i_type == SLICE_TYPE_B ) for( int j = 0; j < h->mb.pic.i_fref[1]; j++ ) { - h->mb.pic.p_fref[1][j][i==0 ? 
0:i+3] = &fref[1][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) + h->mb.pic.p_fref[1][j][i?4:0] = &fref[1][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( !i ) for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[1][j][k] = &fref[1][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]]; } @@ -746,19 +749,17 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y ) if( !h->mb.b_interlaced ) { - copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE ); - copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE ); - copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); - copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0 ); x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0 ); - x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0 ); } else { x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 1 ); x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1 ); - x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 1 ); } if( h->fdec->integral ) @@ -1058,16 +1059,42 @@ void x264_macroblock_cache_load_deblock( x264_t *h ) } } -static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i ) +static void ALWAYS_INLINE twiddle_topleft_pixel( pixel *dst, pixel *src, int b_interlaced ) +{ + // We update intra_border_backup in-place, so the topleft neighbor will no longer + // exist there when load_pic_pointers wants it. Move it within p_fdec instead. + if( b_interlaced ) + { + dst[0] = dst[-1]; + dst[-1] = src[0]; + } + else + dst[0] = src[0]; +} + +static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced ) { int w = i ? 8 : 16; - int i_stride = h->fdec->i_stride[!!i]; - int i_stride2 = i_stride << h->mb.b_interlaced; - int i_pix_offset = h->mb.b_interlaced - ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride - : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride); - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, - h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); + int i_stride = h->fdec->i_stride[i]; + int i_stride2 = i_stride << b_interlaced; + int i_pix_offset = b_interlaced + ? 
16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + w * mb_y * i_stride; + pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16]; + if( i ) + { + h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] ); + memcpy( intra_fdec, h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) ); + memcpy( intra_fdec+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) ); + twiddle_topleft_pixel( h->mb.pic.p_fdec[1]-FDEC_STRIDE-1, h->mb.pic.p_fdec[1]-FDEC_STRIDE+7, b_interlaced ); + twiddle_topleft_pixel( h->mb.pic.p_fdec[2]-FDEC_STRIDE-1, h->mb.pic.p_fdec[2]-FDEC_STRIDE+7, b_interlaced ); + } + else + { + h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 ); + memcpy( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) ); + twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, b_interlaced ); + } } void x264_macroblock_cache_save( x264_t *h ) @@ -1084,9 +1111,16 @@ void x264_macroblock_cache_save( x264_t *h ) int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy]; uint8_t *nnz = h->mb.non_zero_count[i_mb_xy]; - x264_macroblock_store_pic( h, 0 ); - x264_macroblock_store_pic( h, 1 ); - x264_macroblock_store_pic( h, 2 ); + if( h->mb.b_interlaced ) + { + x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 1 ); + x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1 ); + } + else + { + x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0 ); + x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0 ); + } x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); diff --git a/common/macroblock.h b/common/macroblock.h index e09cd55..4b4680c 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -292,6 +292,8 @@ void x264_macroblock_bipred_init( x264_t *h ); void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ); +void x264_copy_column8( pixel *dst, pixel *src ); + /* x264_mb_predict_mv_16x16: * set mvp with predicted mv for D_16x16 block * h->mb. 
need only valid values from other blocks */ diff --git a/common/mc.c b/common/mc.c index 5ef0682..b1d5a12 100644 --- a/common/mc.c +++ b/common/mc.c @@ -252,7 +252,7 @@ static pixel *get_ref( pixel *dst, int *i_dst_stride, } /* full chroma mc (ie until 1/8 pixel)*/ -static void mc_chroma( pixel *dst, int i_dst_stride, +static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride, pixel *src, int i_src_stride, int mvx, int mvy, int i_width, int i_height ) @@ -266,14 +266,20 @@ static void mc_chroma( pixel *dst, int i_dst_stride, int cC = (8-d8x)*d8y; int cD = d8x *d8y; - src += (mvy >> 3) * i_src_stride + (mvx >> 3); + src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; for( int y = 0; y < i_height; y++ ) { for( int x = 0; x < i_width; x++ ) - dst[x] = ( cA*src[x] + cB*src[x+1] + cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6; - dst += i_dst_stride; + { + dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] + + cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6; + dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] + + cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6; + } + dstu += i_dst_stride; + dstv += i_dst_stride; src = srcp; srcp += i_src_stride; } @@ -289,7 +295,7 @@ MC_COPY( 8 ) MC_COPY( 4 ) void x264_plane_copy_c( pixel *dst, int i_dst, - uint8_t *src, int i_src, int w, int h) + uint8_t *src, int i_src, int w, int h ) { while( h-- ) { @@ -304,6 +310,50 @@ void x264_plane_copy_c( pixel *dst, int i_dst, } } +void x264_plane_copy_interleave_c( pixel *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ) +{ + for( int y=0; ycopy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; + pf->store_interleave_8x8x2 = store_interleave_8x8x2; + pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc; + pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec; + pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_interleave = x264_plane_copy_interleave_c; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; + pf->hpel_filter = hpel_filter; pf->prefetch_fenc = prefetch_fenc_null; diff --git a/common/mc.h b/common/mc.h index cbdf1a6..58fc664 100644 --- a/common/mc.h +++ b/common/mc.h @@ -68,7 +68,7 @@ typedef struct /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. 
*/ - void (*mc_chroma)(pixel *dst, int i_dst, pixel *src, int i_src, + void (*mc_chroma)(pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src, int mvx, int mvy, int i_width, int i_height ); @@ -78,8 +78,18 @@ typedef struct void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height ); void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height ); + void (*store_interleave_8x8x2)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); + void (*load_deinterleave_8x8x2_fenc)( pixel *dst, pixel *src, int i_src ); + void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src ); + void (*plane_copy)( pixel *dst, int i_dst, - uint8_t *src, int i_src, int w, int h); + uint8_t *src, int i_src, int w, int h ); + void (*plane_copy_interleave)( pixel *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); + void (*plane_copy_deinterleave)( pixel *dstu, int i_dstu, + pixel *dstv, int i_dstv, + pixel *src, int i_src, int w, int h ); void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, int i_stride, int i_width, int i_height, dctcoef *buf ); diff --git a/common/pixel.c b/common/pixel.c index 069589f..63d427f 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -96,9 +96,9 @@ PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) -int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ) +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ) { - int64_t i_ssd = 0; + uint64_t i_ssd = 0; int y; int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15); @@ -136,6 +136,31 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, return i_ssd; } +static uint64_t pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height ) +{ + uint32_t ssd_u=0, ssd_v=0; + for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 ) + for( int x = 0; x < width; x++ ) + { + int du = pixuv1[2*x] - pixuv2[2*x]; + int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; + ssd_u += du*du; + ssd_v += dv*dv; + } + return ssd_u + ((uint64_t)ssd_v<<32); +} + +// SSD in uint32 (i.e. packing two into uint64) can potentially overflow on +// image widths >= 11008 (or 6604 if interlaced), since this is called on blocks +// of height up to 12 (resp 20). Though it will probably take significantly more +// than that at sane distortion levels. 
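+// (Derivation of those bounds, assuming 8-bit pixels: the worst per-sample SSD is
+// 255^2 = 65025, so a uint32 can accumulate floor(2^32/65025) = 66051 samples per
+// channel; 66051/12 ~= 5504 chroma columns, i.e. a luma width around 11008, and
+// 66051/20 ~= 3302, i.e. around 6604 when interlaced.)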
+uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ) +{ + uint64_t ssd = pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height ); + if( i_width&7 ) + ssd += pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height ); + return ssd; +} /**************************************************************************** * pixel_var_wxh @@ -669,6 +694,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var[PIXEL_16x16] = x264_pixel_var_16x16; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8; + pixf->ssd_nv12_core = pixel_ssd_nv12_core; pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; pixf->var2_8x8 = pixel_var2_8x8; @@ -702,6 +728,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT_ADS( _mmxext ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext; #if ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; @@ -747,6 +774,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( ssd, _sse2slow ); INIT2_NAME( sad_aligned, sad, _sse2_aligned ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; diff --git a/common/pixel.h b/common/pixel.h index 2c5330e..54bb8d7 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -78,6 +78,8 @@ typedef struct uint64_t (*var[4])( pixel *pix, int stride ); uint64_t (*hadamard_ac[4])( pixel *pix, int stride ); + uint64_t (*ssd_nv12_core)( pixel *pixuv1, int stride1, + pixel *pixuv2, int stride2, int width, int height ); void (*ssim_4x4x2_core)( const pixel *pix1, int stride1, const pixel *pix2, int stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); @@ -110,7 +112,8 @@ typedef struct } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); -int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ); +uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ); +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ); float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf ); #endif diff --git a/common/ppc/mc.c b/common/ppc/mc.c index 744a804..24c539b 100644 --- a/common/ppc/mc.c +++ b/common/ppc/mc.c @@ -800,7 +800,7 @@ void x264_mc_altivec_init( x264_mc_functions_t *pf ) #if !X264_HIGH_BIT_DEPTH pf->mc_luma = mc_luma_altivec; pf->get_ref = get_ref_altivec; - pf->mc_chroma = mc_chroma_altivec; +// pf->mc_chroma = mc_chroma_altivec; pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_altivec; diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 3a31e26..29bdea9 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -40,71 +40,120 @@ cextern pb_a1 [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], 
[base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + mova m1, m0 + mova m3, m2 + punpckl%3 m0, m4 + punpckh%3 m1, m4 + punpckl%3 m2, m6 + punpckh%3 m3, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 +%macro TRANSPOSE8x4B_STORE 8 + mova m4, m0 + mova m5, m1 + mova m6, m2 punpckhdq m4, m4 punpckhdq m5, m5 punpckhdq m6, m6 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 + mova m1, m0 punpcklwd m0, m2 punpckhwd m1, m2 - movd %1, m0 + movh %1, m0 punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + movh %2, m0 + movh %3, m1 punpckhdq m1, m1 - movd %4, m1 + movh %4, m1 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 + mova m5, m4 punpcklwd m4, m6 punpckhwd m5, m6 - movd %5, m4 + movh %5, m4 punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + movh %6, m4 + movh %7, m5 punpckhdq m5, m5 - movd %8, m5 + movh %8, m5 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro + +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==16 + TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 +%else + SWAP 1, 4, 2, 3 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [t5+r1*2] + mova m3, [t5+t6] + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif +%endmacro + +%macro TRANSPOSE8x2W_STORE 8 + mova m0, m1 + punpcklwd m1, m2 + punpckhwd m0, m2 +%if mmsize==8 + movd %1, m1 + movd %3, m0 + psrlq m1, 32 + psrlq m0, 32 + movd %2, m1 + movd %4, m0 +%else + movd %1, m1 + movd %5, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %2, m1 + movd %6, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %3, m1 + movd %7, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %4, m1 + movd %8, m0 +%endif %endmacro %macro SBUTTERFLY3 4 @@ -116,6 +165,7 @@ cextern pb_a1 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -123,30 +173,32 @@ cextern pb_a1 movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY3 bw, m6, %8, m5 - SBUTTERFLY3 wd, m0, m2, m1 - SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY3 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY3 wd, m7, [%9+0x10], m6 - SBUTTERFLY3 wd, m3, m5, m4 - SBUTTERFLY3 dq, m7, m3, m0 - SBUTTERFLY3 dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY3 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -154,29 +206,30 @@ cextern pb_a1 movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - SBUTTERFLY3 bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY3 wd, m0, m2, m3 - SBUTTERFLY3 wd, m4, m6, m2 - SBUTTERFLY3 wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY3 wd, m2, m5, m1 - SBUTTERFLY3 dq, m0, m4, m5 - SBUTTERFLY3 dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY3 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY3 dq, m3, %11, m0 - SBUTTERFLY3 dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY3 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 @@ -365,7 +418,7 @@ cglobal deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -375,7 +428,7 @@ cglobal deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -484,7 +537,7 @@ cglobal deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -492,7 +545,7 @@ cglobal deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -767,117 +820,152 @@ DEBLOCK_LUMA_INTRA mmxext, v8 -INIT_MMX - %macro CHROMA_V_START 0 dec r2d ; alpha-1 dec r3d ; beta-1 mov t5, r0 sub t5, r1 sub t5, r1 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif %endmacro %macro CHROMA_H_START 0 dec r2d 
dec r3d - sub r0, 2 + sub r0, 4 lea t6, [r1*3] mov t5, r0 add r0, t6 +%if mmsize==8 + mov dword r0m, 2 +.skip_prologue: +%endif +%endmacro + +%macro CHROMA_V_LOOP 1 +%if mmsize==8 + add r0, 8 + add t5, 8 +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif +%endmacro + +%macro CHROMA_H_LOOP 1 +%if mmsize==8 + lea r0, [r0+r1*4] + lea t5, [t5+r1*4] +%if %1 + add r4, 2 +%endif + dec dword r0m + jg .skip_prologue +%endif %endmacro %define t5 r5 %define t6 r6 +%macro DEBLOCK_CHROMA 1 ;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_%1, 5,6,8 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_inter_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 1 RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_mmxext, 5,7 -%ifdef ARCH_X86_64 - %define buf0 [rsp-24] - %define buf1 [rsp-16] -%else - %define buf0 r0m - %define buf1 r2m -%endif +cglobal deblock_h_chroma_%1, 5,7,8 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - call chroma_inter_body_mmxext - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_inter_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 1 RET ALIGN 16 -chroma_inter_body_mmxext: +RESET_MM_PERMUTATION +chroma_inter_body_%1: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 + punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 ret +%endmacro ; DEBLOCK_CHROMA +INIT_XMM +DEBLOCK_CHROMA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA mmxext +%endif ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 %macro CHROMA_INTRA_P0 3 - movq m4, %1 + mova m4, %1 pxor m4, %3 pand m4, [pb_1] ; m4 = (p0^q1)&1 pavgb %1, %3 psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) %endmacro %define t5 r4 %define t6 r5 +%macro DEBLOCK_CHROMA_INTRA 1 ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_%1, 4,5,8 CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [r0] + mova m3, [r0+r1] + call chroma_intra_body_%1 + mova [t5+r1], m1 + mova [r0], m2 + CHROMA_V_LOOP 0 RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_%1, 4,6,8 
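+; NV12 interleaves U and V, so each column position along the edge is a UV word;
+; the transpose therefore works on words rather than bytes, and the mmxext
+; version covers the 8 rows in two 4-row passes via CHROMA_H_LOOP.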
CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + call chroma_intra_body_%1 + TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + CHROMA_H_LOOP 0 RET ALIGN 16 -chroma_intra_body_mmxext: +RESET_MM_PERMUTATION +chroma_intra_body_%1: LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 + mova m5, m1 + mova m6, m2 CHROMA_INTRA_P0 m1, m0, m3 CHROMA_INTRA_P0 m2, m3, m0 psubb m1, m5 @@ -887,6 +975,16 @@ chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret +%endmacro ; DEBLOCK_CHROMA_INTRA + +INIT_XMM +DEBLOCK_CHROMA_INTRA sse2 +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA_INTRA mmxext +%endif + + ;----------------------------------------------------------------------------- ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2], diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index f9347ba..ea70e41 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -28,15 +28,19 @@ SECTION_RODATA 32 -ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 +ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf_adj: times 8 db 0 + times 8 db 2 + times 8 db 4 + times 8 db 6 SECTION .text -cextern pw_1 cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_00ff cextern sw_64 ;============================================================================= @@ -896,28 +900,27 @@ COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa ;----------------------------------------------------------------------------- %ifdef ARCH_X86_64 cglobal prefetch_fenc_mmxext, 5,5 + and r4d, 3 mov eax, r4d - and eax, 3 - imul eax, r1d - lea r0, [r0+rax*4+64] + imul r4d, r1d + lea r0, [r0+r4*4+64] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] - and r4d, 6 - imul r4d, r3d - lea r2, [r2+r4+64] + imul eax, r3d + lea r2, [r2+rax*2+64] prefetcht0 [r2] prefetcht0 [r2+r3] RET %else -cglobal prefetch_fenc_mmxext - mov r2, [esp+20] - mov r1, [esp+8] - mov r0, [esp+4] +cglobal prefetch_fenc_mmxext, 0,3 + mov r2, r4m + mov r1, r1m + mov r0, r0m and r2, 3 imul r2, r1 lea r0, [r0+r2*4+64] @@ -927,12 +930,12 @@ cglobal prefetch_fenc_mmxext prefetcht0 [r0] prefetcht0 [r0+r1] - mov r2, [esp+20] - mov r1, [esp+16] - mov r0, [esp+12] - and r2, 6 + mov r2, r4m + mov r1, r3m + mov r0, r2m + and r2, 3 imul r2, r1 - lea r0, [r0+r2+64] + lea r0, [r0+r2*2+64] prefetcht0 [r0] prefetcht0 [r0+r1] ret @@ -963,237 +966,380 @@ cglobal prefetch_ref_mmxext, 3,3 ; chroma MC ;============================================================================= - %define t0 rax %ifdef ARCH_X86_64 - %define t1 r10 + DECLARE_REG_TMP 10,11,6 %else - %define t1 r1 + DECLARE_REG_TMP 0,1,2 %endif %macro MC_CHROMA_START 0 - movifnidn r2, r2mp - movifnidn r3d, r3m + movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m - mov t0d, r5d - mov t1d, r4d + movifnidn t2d, r6m + mov t0d, t2d + mov t1d, r5d sar t0d, 3 sar t1d, 3 - imul t0d, r3d - add t0d, t1d + imul t0d, r4d + lea t0d, [t0+t1*2] movsxdifnidn t0, t0d - add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride + add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride +%endmacro + +%macro UNPACK_UNALIGNED_MEM 3 + punpcklwd %1, %3 +%endmacro + +%macro UNPACK_UNALIGNED_LOAD 3 + movh %2, %3 + punpcklwd %1, %2 %endmacro ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dst, int dst_stride, +; void mc_chroma( uint8_t *dstu, uint8_t 
*dstv, int dst_stride, ; uint8_t *src, int src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1-2 0 -cglobal mc_chroma_%1 -%if mmsize == 16 - cmp dword r6m, 4 - jle mc_chroma_mmxext -%endif - PROLOGUE 0,6,%2 +%macro MC_CHROMA 1 +cglobal mc_chroma_%1, 0,6 MC_CHROMA_START - pxor m3, m3 - and r4d, 7 ; dx &= 7 + and r5d, 7 +%ifdef ARCH_X86_64 jz .mc1dy - and r5d, 7 ; dy &= 7 - jz .mc1dx - - movd m5, r4d - movd m6, r5d - SPLATW m5, m5 ; m5 = dx - SPLATW m6, m6 ; m6 = dy - - mova m4, [pw_8] - mova m0, m4 - psubw m4, m5 ; m4 = 8-dx - psubw m0, m6 ; m0 = 8-dy - - mova m7, m5 - pmullw m5, m0 ; m5 = dx*(8-dy) = cB - pmullw m7, m6 ; m7 = dx*dy = cD - pmullw m6, m4 ; m6 = (8-dx)*dy = cC - pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA - - mov r4d, r7m +%endif + and t2d, 7 %ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 + jz .mc1dx +%endif + shl r5d, 16 + add t2d, r5d + mov t0d, t2d + shl t2d, 8 + sub t2d, t0d + add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y) + cmp dword r7m, 4 +%if mmsize==8 +.skip_prologue: %else - mov r0, r0mp - mov r1, r1m - mov r5, r2 + jl mc_chroma_mmxext %+ .skip_prologue + WIN64_SPILL_XMM 9 %endif - -.loop2d: - movh m1, [r2+r3] - movh m0, [r2] - punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4 - punpcklbw m0, m3 - pmullw m1, m6 ; 2nd line * cC - pmullw m0, m4 ; 1st line * cA - paddw m0, m1 ; m0 <- result - - movh m2, [r2+1] - movh m1, [r2+r3+1] - punpcklbw m2, m3 - punpcklbw m1, m3 - - paddw m0, [pw_32] - - pmullw m2, m5 ; line * cB - pmullw m1, m7 ; line * cD + movd m5, t2d + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + pxor m6, m6 + punpcklbw m5, m6 +%if mmsize==8 + pshufw m7, m5, 0xee + pshufw m6, m5, 0x00 + pshufw m5, m5, 0x55 + jge .width4 +%else +%ifdef WIN64 + cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM +%endif + pshufd m7, m5, 0x55 + punpcklwd m5, m5 + pshufd m6, m5, 0x00 + pshufd m5, m5, 0x55 + jg .width8 +%endif + movu m0, [r3] + UNPACK_UNALIGNED m0, m1, [r3+2] + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 + pmaddwd m0, m7 + pmaddwd m1, m7 + packssdw m0, m1 + SWAP m3, m0 +ALIGN 4 +.loop2: + movu m0, [r3+r4] + UNPACK_UNALIGNED m0, m1, [r3+r4+2] + pmullw m3, m6 + mova m1, m0 + pand m0, [pw_00ff] + psrlw m1, 8 + pmaddwd m0, m7 + pmaddwd m1, m7 + mova m2, [pw_32] + packssdw m0, m1 + paddw m2, m3 + mova m3, m0 + pmullw m0, m5 paddw m0, m2 - paddw m0, m1 psrlw m0, 6 + packuswb m0, m0 + movd [r0], m0 +%if mmsize==8 + psrlq m0, 16 +%else + psrldq m0, 4 +%endif + movd [r1], m0 + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop2 + REP_RET - packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4 - movh [r0], m0 - - add r2, r3 - add r0, r1 ; dst_stride - dec r4d - jnz .loop2d - -%if mmsize == 8 - sub dword r6m, 8 - jnz .finish ; width != 8 so assume 4 +%if mmsize==8 +.width4: %ifdef ARCH_X86_64 - lea r0, [r10+4] ; dst - lea r2, [r11+4] ; src + mov t0, r0 + mov t1, r1 + mov t2, r3 + %define multy0 [rsp-8] + mova multy0, m5 %else - mov r0, r0mp - lea r2, [r5+4] - add r0, 4 + mov r3m, r3 + %define multy0 r4m + mova multy0, m5 %endif - mov r4d, r7m ; height - jmp .loop2d %else +.width8: +%ifdef ARCH_X86_64 + %define multy0 m8 + SWAP m8, m5 +%else + %define multy0 r0m + mova multy0, m5 +%endif +%endif +.loopx: + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + mova m2, m0 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 
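+    ; m0/m1 now hold the U bytes and m2/m3 the V bytes of each NV12 word, widened
+    ; to 16 bits, so one set of pmaddwd ops applies the (8-dx,dx) horizontal
+    ; weights to both chroma planes at once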
+ pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + SWAP m4, m0 + SWAP m5, m1 + add r3, r4 +ALIGN 4 +.loop4: + movu m0, [r3] + movu m1, [r3+mmsize/2] + UNPACK_UNALIGNED m0, m2, [r3+2] + UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] + mova m2, m0 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + pmaddwd m0, m7 + pmaddwd m2, m7 + pmaddwd m1, m7 + pmaddwd m3, m7 + packssdw m0, m2 + packssdw m1, m3 + pmullw m4, m6 + pmullw m5, m6 + mova m2, [pw_32] + mova m3, m2 + paddw m2, m4 + paddw m3, m5 + mova m4, m0 + mova m5, m1 + pmullw m0, multy0 + pmullw m1, multy0 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 6 + psrlw m1, 6 + packuswb m0, m1 +%if mmsize==8 + pshufw m1, m0, 0x8 + pshufw m0, m0, 0xd + movd [r0], m1 + movd [r1], m0 +%else + pshufd m0, m0, 0xd8 + movq [r0], m0 + movhps [r1], m0 +%endif + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop4 +%if mmsize!=8 + REP_RET +%else + sub dword r7m, 4 + jg .width8 REP_RET -%endif ; mmsize +.width8: +%ifdef ARCH_X86_64 + lea r3, [t2+8] + lea r0, [t0+4] + lea r1, [t1+4] +%else + mov r3, r3m + mov r0, r0m + mov r1, r1m + add r3, 8 + add r0, 4 + add r1, 4 +%endif + mov r5d, r8m + jmp .loopx +%endif +%ifdef ARCH_X86_64 ; too many regs for x86_32 + RESET_MM_PERMUTATION +%ifdef WIN64 +%if xmm_regs_used > 6 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 + %assign xmm_regs_used 6 +%endif +%endif .mc1dy: - and r5d, 7 - movd m6, r5d - mov r5, r3 ; pel_offset = dx ? 1 : src_stride + and t2d, 7 + movd m5, t2d + mov r6d, r4d ; pel_offset = dx ? 2 : src_stride jmp .mc1d .mc1dx: - movd m6, r4d - mov r5d, 1 + movd m5, r5d + mov r6d, 2 .mc1d: - mova m5, [pw_8] - SPLATW m6, m6 - mova m7, [pw_4] - psubw m5, m6 - movifnidn r0, r0mp - movifnidn r1d, r1m - mov r4d, r7m -%if mmsize == 8 - cmp dword r6m, 8 - je .loop1d_w8 + mova m4, [pw_8] + SPLATW m5, m5 + psubw m4, m5 + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + cmp dword r7m, 4 + jg .mc1d_w8 + mov r10, r2 + mov r11, r4 +%if mmsize!=8 + shr r5d, 1 %endif - .loop1d_w4: - movh m0, [r2+r5] - movh m1, [r2] - punpcklbw m0, m3 - punpcklbw m1, m3 - pmullw m0, m6 - pmullw m1, m5 - paddw m0, m7 - paddw m0, m1 - psrlw m0, 3 - packuswb m0, m3 - movh [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w4 -.finish: - REP_RET - -%if mmsize == 8 -.loop1d_w8: - movu m0, [r2+r5] - mova m1, [r2] + movq m0, [r3] + movq m1, [r3+r6] +%if mmsize!=8 + add r3, r11 + movhps m0, [r3] + movhps m1, [r3+r6] +%endif mova m2, m0 - mova m4, m1 - punpcklbw m0, m3 - punpcklbw m1, m3 - punpckhbw m2, m3 - punpckhbw m4, m3 - pmullw m0, m6 + mova m3, m1 + pand m0, [pw_00ff] + pand m1, [pw_00ff] + psrlw m2, 8 + psrlw m3, 8 + pmullw m0, m4 pmullw m1, m5 - pmullw m2, m6 - pmullw m4, m5 - paddw m0, m7 - paddw m2, m7 + pmullw m2, m4 + pmullw m3, m5 + paddw m0, [pw_4] + paddw m2, [pw_4] paddw m0, m1 - paddw m2, m4 + paddw m2, m3 psrlw m0, 3 psrlw m2, 3 packuswb m0, m2 - mova [r0], m0 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop1d_w8 +%if mmsize==8 + xchg r4, r11 + xchg r2, r10 + movd [r0], m0 + psrlq m0, 32 + movd [r1], m0 +%else + movhlps m1, m0 + movd [r0], m0 + movd [r1], m1 + add r0, r10 + add r1, r10 + psrldq m0, 4 + psrldq m1, 4 + movd [r0], m0 + movd [r1], m1 +%endif + add r3, r4 + add r0, r2 + add r1, r2 + dec r5d + jg .loop1d_w4 REP_RET -%endif ; mmsize +.mc1d_w8: + sub r2, 4 + sub r4, 8 + mov r10, 4 + mov r11, 8 +%if mmsize==8 + shl r5d, 1 +%endif + jmp .loop1d_w4 +%endif ; ARCH_X86_64 %endmacro ; 
MC_CHROMA -INIT_MMX -MC_CHROMA mmxext -INIT_XMM -MC_CHROMA sse2, 8 -%macro MC_CHROMA_SSSE3 2 -INIT_MMX -cglobal mc_chroma_ssse3%1, 0,6,%2 +%macro MC_CHROMA_SSSE3 0-1 +INIT_XMM +cglobal mc_chroma_ssse3%1, 0,6,9 MC_CHROMA_START - and r4d, 7 and r5d, 7 - mov t0d, r4d + and t2d, 7 + mov t0d, r5d shl t0d, 8 - sub t0d, r4d - mov r4d, 8 + sub t0d, r5d + mov r5d, 8 add t0d, 8 - sub r4d, r5d - imul r5d, t0d ; (x*255+8)*y - imul r4d, t0d ; (x*255+8)*(8-y) - cmp dword r6m, 4 - jg .width8 - mova m5, [pw_32] - movd m6, r5d - movd m7, r4d - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 - mov r5, r2 - and r2, ~3 - and r5, 3 + sub r5d, t2d + imul t2d, t0d ; (x*255+8)*y + imul r5d, t0d ; (x*255+8)*(8-y) + movd m6, t2d + movd m7, r5d +%ifidn %1, _cache64 + mov t0d, r3d + and t0d, 7 %ifdef PIC - lea r11, [ch_shuffle] - movu m5, [r11 + r5*2] + lea t1, [ch_shuf_adj] + movddup m5, [t1 + t0*4] %else - movu m5, [ch_shuffle + r5*2] + movddup m5, [ch_shuf_adj + t0*4] %endif - movu m0, [r2] + paddb m5, [ch_shuf] + and r3, ~7 +%else + mova m5, [ch_shuf] +%endif + movifnidn r0, r0mp + movifnidn r1, r1mp + movifnidn r2d, r2m + movifnidn r5d, r8m + SPLATW m6, m6 + SPLATW m7, m7 + cmp dword r7m, 4 + jg .width8 + movu m0, [r3] pshufb m0, m5 .loop4: - movu m1, [r2+r3] + movu m1, [r3+r4] pshufb m1, m5 - movu m3, [r2+2*r3] + movu m3, [r3+r4*2] pshufb m3, m5 - lea r2, [r2+2*r3] mova m2, m1 mova m4, m3 pmaddubsw m0, m7 @@ -1207,109 +1353,90 @@ cglobal mc_chroma_ssse3%1, 0,6,%2 mova m0, m4 psrlw m1, 6 psrlw m3, 6 - packuswb m1, m1 - packuswb m3, m3 - movh [r0], m1 - movh [r0+r1], m3 - sub r4d, 2 - lea r0, [r0+2*r1] + packuswb m1, m3 + movhlps m3, m1 + movd [r0], m1 + movd [r0+r2], m3 + psrldq m1, 4 + psrldq m3, 4 + movd [r1], m1 + movd [r1+r2], m3 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 jg .loop4 REP_RET -INIT_XMM .width8: - movd m6, r5d - movd m7, r4d - movifnidn r0, r0mp - movifnidn r1d, r1m - movifnidn r4d, r7m - SPLATW m6, m6 - SPLATW m7, m7 -%ifidn %1, _cache64 - mov r5, r2 - and r5, 0x3f - cmp r5, 0x38 - jge .split -%endif - mova m5, [pw_32] - movh m0, [r2] - movh m1, [r2+1] - punpcklbw m0, m1 -.loop8: - movh m1, [r2+1*r3] - movh m2, [r2+1*r3+1] - movh m3, [r2+2*r3] - movh m4, [r2+2*r3+1] - punpcklbw m1, m2 - punpcklbw m3, m4 - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 - pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, m5 - paddw m2, m5 - paddw m1, m0 - paddw m3, m2 - mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 - packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, [r0+2*r1] - jg .loop8 - REP_RET -%ifidn %1, _cache64 -.split: - and r2, ~7 - and r5, 7 -%ifdef PIC - lea r11, [ch_shuffle] - movu m5, [r11 + r5*2] -%else - movu m5, [ch_shuffle + r5*2] -%endif - movu m0, [r2] + movu m0, [r3] pshufb m0, m5 + movu m1, [r3+8] + pshufb m1, m5 %ifdef ARCH_X86_64 - mova m8, [pw_32] - %define round m8 + SWAP m8, m6 + %define mult1 m8 %else - %define round [pw_32] + mova r0m, m6 + %define mult1 r0m %endif -.splitloop8: - movu m1, [r2+r3] - pshufb m1, m5 - movu m3, [r2+2*r3] +.loop8: + movu m2, [r3+r4] + pshufb m2, m5 + movu m3, [r3+r4+8] pshufb m3, m5 - lea r2, [r2+2*r3] - mova m2, m1 - mova m4, m3 + mova m4, m2 + mova m6, m3 pmaddubsw m0, m7 - pmaddubsw m1, m6 - pmaddubsw m2, m7 - pmaddubsw m3, m6 - paddw m0, round - paddw m2, round - paddw m1, m0 - paddw m3, m2 - mova m0, m4 + pmaddubsw m1, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m0, [pw_32] + paddw m1, [pw_32] + paddw m0, m2 + 
paddw m1, m3 + psrlw m0, 6 psrlw m1, 6 + packuswb m0, m1 + pshufd m0, m0, 0xd8 + movq [r0], m0 + movhps [r1], m0 + + movu m2, [r3+r4*2] + pshufb m2, m5 + movu m3, [r3+r4*2+8] + pshufb m3, m5 + mova m0, m2 + mova m1, m3 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pmaddubsw m2, mult1 + pmaddubsw m3, mult1 + paddw m4, [pw_32] + paddw m6, [pw_32] + paddw m2, m4 + paddw m3, m6 + psrlw m2, 6 psrlw m3, 6 - packuswb m1, m3 - movh [r0], m1 - movhps [r0+r1], m1 - sub r4d, 2 - lea r0, [r0+2*r1] - jg .splitloop8 + packuswb m2, m3 + pshufd m2, m2, 0xd8 + movq [r0+r2], m2 + movhps [r1+r2], m2 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop8 REP_RET -%endif -; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size %endmacro -MC_CHROMA_SSSE3 , 8 -MC_CHROMA_SSSE3 _cache64, 9 +INIT_MMX +%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM +MC_CHROMA mmxext +INIT_XMM +MC_CHROMA sse2_misalign +%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD +MC_CHROMA sse2 +MC_CHROMA_SSSE3 +MC_CHROMA_SSSE3 _cache64 + diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 368497b..2e58bda 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -31,15 +31,17 @@ SECTION_RODATA filt_mul20: times 16 db 20 filt_mul15: times 8 db 1, -5 filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 SECTION .text cextern pw_1 cextern pw_16 cextern pw_32 -cextern pd_128 +cextern pw_00ff cextern pw_3fff +cextern pd_128 %macro LOAD_ADD 4 movh %4, %3 @@ -171,7 +173,7 @@ cglobal hpel_filter_v_%1, 5,6,%2 mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 FILT_PACK m1, m4, 5, m7 - movnt [r0+r4], m1 + movnta [r0+r4], m1 add r1, mmsize add r5, mmsize add r4, mmsize @@ -689,6 +691,213 @@ cglobal plane_copy_core_mmxext, 6,7 RET +%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint + movq m0, [%2] +%if mmsize==16 +%if %4 + punpcklbw m0, [%3] +%else + movq m1, [%3] + punpcklbw m0, m1 +%endif + mov%5a [%1], m0 +%else + movq m1, [%3] + mova m2, m0 + punpcklbw m0, m1 + punpckhbw m2, m1 + mov%5a [%1], m0 + mov%5a [%1+8], m2 +%endif +%endmacro + +%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant +%if mmsize==16 + mova m0, [%3] +%ifidn %5, ssse3 + pshufb m0, %6 +%else + mova m1, m0 + pand m0, %6 + psrlw m1, 8 + packuswb m0, m1 +%endif +%if %4 + mova [%1], m0 +%else + movq [%1], m0 + movhps [%2], m0 +%endif +%else + mova m0, [%3] + mova m1, [%3+8] + mova m2, m0 + mova m3, m1 + pand m0, %6 + pand m1, %6 + psrlw m2, 8 + psrlw m3, 8 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endif +%endmacro + +%macro PLANE_INTERLEAVE 1 +;----------------------------------------------------------------------------- +; void plane_copy_interleave_core( uint8_t *dst, int i_dst, +; uint8_t *srcu, int i_srcu, +; uint8_t *srcv, int i_srcv, int w, int h ) +;----------------------------------------------------------------------------- +; assumes i_dst and w are multiples of 16, and i_dst>2*w +cglobal plane_copy_interleave_core_%1, 6,7 + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + lea r0, [r0+r6*2] + add r2, r6 + add r4, r6 +%ifdef ARCH_X86_64 + DECLARE_REG_TMP 10,11 +%else + DECLARE_REG_TMP 1,3 +%endif + mov t0d, r7m + mov t1d, r1d + shr t1d, 1 + sub t1d, r6d +.loopy: + mov r6d, r6m + neg r6 +.prefetch: + prefetchnta [r2+r6] + prefetchnta [r4+r6] + add r6, 64 + jl 
.prefetch + mov r6d, r6m + neg r6 +.loopx: + INTERLEAVE r0+r6*2, r2+r6, r4+r6, 0, nt + INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt + add r6, 16 + jl .loopx +.pad: +%if mmsize==8 + movntq [r0+r6*2], m0 + movntq [r0+r6*2+8], m0 + movntq [r0+r6*2+16], m0 + movntq [r0+r6*2+24], m0 +%else + movntdq [r0+r6*2], m0 + movntdq [r0+r6*2+16], m0 +%endif + add r6, 16 + cmp r6, t1 + jl .pad + add r0, r1mp + add r2, r3mp + add r4, r5 + dec t0d + jg .loopy + sfence + emms + RET + +;----------------------------------------------------------------------------- +; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ) +;----------------------------------------------------------------------------- +cglobal store_interleave_8x8x2_%1, 4,5 + mov r4d, 4 +.loop: + INTERLEAVE r0, r2, r3, 1 + INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1 + add r2, FDEC_STRIDE*2 + add r3, FDEC_STRIDE*2 + lea r0, [r0+r1*2] + dec r4d + jg .loop + REP_RET +%endmacro ; PLANE_INTERLEAVE + +%macro DEINTERLEAVE_START 1 +%ifidn %1, ssse3 + mova m4, [deinterleave_shuf] +%else + mova m4, [pw_00ff] +%endif +%endmacro + +%macro PLANE_DEINTERLEAVE 1 +;----------------------------------------------------------------------------- +; void plane_copy_deinterleave( uint8_t *dstu, int i_dstu, +; uint8_t *dstv, int i_dstv, +; uint8_t *src, int i_src, int w, int h ) +;----------------------------------------------------------------------------- +cglobal plane_copy_deinterleave_%1, 6,7 + DEINTERLEAVE_START %1 + mov r6d, r6m + movsxdifnidn r1, r1d + movsxdifnidn r3, r3d + movsxdifnidn r5, r5d + add r0, r6 + add r2, r6 + lea r4, [r4+r6*2] +.loopy: + mov r6d, r6m + neg r6 +.loopx: + DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, %1, m4 + DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m4 + add r6, 16 + jl .loopx + add r0, r1 + add r2, r3 + add r4, r5 + dec dword r7m + jg .loopy + REP_RET + +;----------------------------------------------------------------------------- +; void load_deinterleave_8x8x2_fenc( uint8_t *dst, uint8_t *src, int i_src ) +;----------------------------------------------------------------------------- +cglobal load_deinterleave_8x8x2_fenc_%1, 3,4 + DEINTERLEAVE_START %1 + mov r3d, 4 +.loop: + DEINTERLEAVE r0, r0+FENC_STRIDE/2, r1, 1, %1, m4 + DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE*3/2, r1+r2, 1, %1, m4 + add r0, FENC_STRIDE*2 + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void load_deinterleave_8x8x2_fdec( uint8_t *dst, uint8_t *src, int i_src ) +;----------------------------------------------------------------------------- +cglobal load_deinterleave_8x8x2_fdec_%1, 3,4 + DEINTERLEAVE_START %1 + mov r3d, 4 +.loop: + DEINTERLEAVE r0, r0+FDEC_STRIDE/2, r1, 0, %1, m4 + DEINTERLEAVE r0+FDEC_STRIDE, r0+FDEC_STRIDE*3/2, r1+r2, 0, %1, m4 + add r0, FDEC_STRIDE*2 + lea r1, [r1+r2*2] + dec r3d + jg .loop + REP_RET +%endmacro ; PLANE_DEINTERLEAVE + +INIT_MMX +PLANE_INTERLEAVE mmxext +PLANE_DEINTERLEAVE mmx +INIT_XMM +PLANE_INTERLEAVE sse2 +PLANE_DEINTERLEAVE sse2 +PLANE_DEINTERLEAVE ssse3 + ; These functions are not general-use; not only do the SSE ones require aligned input, ; but they also will fail if given a non-mod16 size or a size less than 64. 
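For readers following the new interleave/deinterleave paths added above, a scalar reference for the same data movement is useful to keep in mind while reading the asm. The sketch below splits a packed NV12 chroma plane (UVUV...) into separate U/V planes and does the inverse; the helper names are illustrative and not necessarily the exact C fallbacks elsewhere in this patch, but the argument order mirrors the prototypes declared in mc-c.c below.

/* Minimal C sketch of the (de)interleave operations the new SIMD code performs.
 * Hypothetical reference helpers, shown only to clarify the data movement. */
#include <stdint.h>

static void plane_copy_deinterleave_ref( uint8_t *dstu, int i_dstu,
                                         uint8_t *dstv, int i_dstv,
                                         uint8_t *src,  int i_src, int w, int h )
{
    for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
        for( int x = 0; x < w; x++ )
        {
            dstu[x] = src[2*x];   /* even bytes of the packed plane are U */
            dstv[x] = src[2*x+1]; /* odd bytes are V */
        }
}

static void plane_copy_interleave_ref( uint8_t *dst, int i_dst,
                                       uint8_t *srcu, int i_srcu,
                                       uint8_t *srcv, int i_srcv, int w, int h )
{
    for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
        for( int x = 0; x < w; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

The SSE2 DEINTERLEAVE macro above gets the same effect with pand/psrlw plus packuswb (mask out the U bytes, shift down the V bytes), while the SSSE3 variant does it in a single pshufb using deinterleave_shuf.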
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 4bb5f33..ff1e0ab 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -76,20 +76,34 @@ void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int ); void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); void x264_prefetch_ref_mmxext( uint8_t *, int, int ); -void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); -void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int dx, int dy, int i_width, int i_height ); void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); -void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h); +void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h ); +void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst, + uint8_t *srcu, int i_srcu, + uint8_t *srcv, int i_srcv, int w, int h ); +void x264_plane_copy_deinterleave_mmx( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, + uint8_t *dstv, int i_dstv, + uint8_t *src, int i_src, int w, int h ); +void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ); +void x264_load_deinterleave_8x8x2_fenc_mmx( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fenc_sse2( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fdec_mmx( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fdec_sse2( uint8_t *dst, uint8_t *src, int i_src ); +void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src ); void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); void x264_memzero_aligned_mmx( void * dst, int n ); @@ -103,6 +117,17 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, int len ); + +#define MC_CHROMA(cpu)\ +void x264_mc_chroma_##cpu( uint8_t *dstu, uint8_t *dstv, int i_dst,\ + uint8_t *src, int i_src,\ + int dx, int dy, int i_width, int i_height ); +MC_CHROMA(mmxext) +MC_CHROMA(sse2) +MC_CHROMA(sse2_misalign) +MC_CHROMA(ssse3) 
+MC_CHROMA(ssse3_cache64) + #define LOWRES(cpu)\ void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\ int src_stride, int dst_stride, int width, int height ); @@ -343,11 +368,13 @@ HPEL(16, ssse3, ssse3, ssse3, ssse3) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) -static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h) +static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h ) { if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if(i_src > 0) { + } else if( !(w&15) ) { + x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h ); + } else if( i_src > 0 ) { // have to use plain memcpy on the last line (in memory order) to avoid overreading src x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 ); memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w ); @@ -356,6 +383,27 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 ); } } + +#define PLANE_INTERLEAVE(cpu) \ +static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\ + uint8_t *srcu, int i_srcu,\ + uint8_t *srcv, int i_srcv, int w, int h )\ +{\ + if( !(w&15) ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ + } else if( i_srcu > 0 ) {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ + x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ + } else {\ + x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ + }\ +} + +PLANE_INTERLEAVE(mmxext) +PLANE_INTERLEAVE(sse2) #endif // !X264_HIGH_BIT_DEPTH void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) @@ -396,7 +444,14 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext; + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext; + pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx; + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx; + pf->plane_copy = x264_plane_copy_mmxext; + pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx; + pf->hpel_filter = x264_hpel_filter_mmxext; pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext; @@ -450,10 +505,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_SSE_MISALIGN ) pf->hpel_filter = x264_hpel_filter_sse2_misalign; pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - pf->mc_chroma = x264_mc_chroma_sse2; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { + pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium? 
+ pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2; + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2; + pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->mc_luma = mc_luma_sse2; pf->get_ref = get_ref_sse2; if( cpu&X264_CPU_CACHELINE_64 ) @@ -462,7 +523,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->get_ref = get_ref_cache64_sse2; } if( cpu&X264_CPU_SSE_MISALIGN ) + { pf->get_ref = get_ref_sse2_misalign; + pf->mc_chroma = x264_mc_chroma_sse2_misalign; + } } if( !(cpu&X264_CPU_SSSE3) ) @@ -477,12 +541,19 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3; + pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; - pf->mc_chroma = x264_mc_chroma_ssse3; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_ssse3; + if( cpu&X264_CPU_CACHELINE_64 ) { - pf->mc_chroma = x264_mc_chroma_ssse3_cache64; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_ssse3_cache64; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 1756f86..628ad34 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -40,6 +40,7 @@ hmul_8p: times 8 db 1 times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 SECTION .text @@ -304,6 +305,55 @@ SSD 4, 4, ssse3 SSD 4, 8, ssse3 %assign function_align 16 +;----------------------------------------------------------------------------- +; uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height ) +;----------------------------------------------------------------------------- +%macro SSD_NV12 1-2 0 +cglobal pixel_ssd_nv12_core_%1, 6,7 + shl r4d, 1 + add r0, r4 + add r2, r4 + pxor m3, m3 + pxor m4, m4 + mova m5, [pw_00ff] +.loopy: + mov r6, r4 + neg r6 +.loopx: + mova m0, [r0+r6] + mova m1, [r2+r6] + psubusb m0, m1 + psubusb m1, [r0+r6] + por m0, m1 + mova m2, m0 + pand m0, m5 + psrlw m2, 8 + pmaddwd m0, m0 + pmaddwd m2, m2 + paddd m3, m0 + paddd m4, m2 + add r6, mmsize + jl .loopx + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + HADDD m3, m0 + HADDD m4, m0 + movd eax, m3 + movd edx, m4 +%ifdef ARCH_X86_64 + shl rdx, 32 + add rax, rdx +%endif + RET +%endmacro ; SSD_NV12 + +INIT_MMX +SSD_NV12 mmxext +INIT_XMM +SSD_NV12 sse2 + ;============================================================================= ; variance ;============================================================================= @@ -2158,9 +2208,7 @@ cglobal pixel_ssim_end4_sse2, 3,3,7 add r6, 4*%1 sub r0d, 4*%1 jg .loop -%ifdef WIN64 - RESTORE_XMM rsp -%endif + WIN64_RESTORE_XMM rsp jmp ads_mvs %endmacro diff --git a/common/x86/pixel.h b/common/x86/pixel.h index b1b916d..8ef0afd 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -99,6 +99,10 @@ void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * ); void 
x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * ); +uint64_t x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); +uint64_t x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1, + uint8_t *pixuv2, int stride2, int width, int height ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index f006f37..a2ccb2e 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -271,13 +271,21 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] ASSERT %2 >= %1 %assign regs_used %2 ASSERT regs_used <= 7 - %assign xmm_regs_used %3 - ASSERT xmm_regs_used <= 16 %if regs_used > 4 push r4 push r5 %assign stack_offset stack_offset+16 %endif + WIN64_SPILL_XMM %3 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 6 sub rsp, (xmm_regs_used-6)*16+16 %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 @@ -287,13 +295,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i %endrep %endif - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 - DEFINE_ARGS %4 %endmacro -%macro RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 1 %if xmm_regs_used > 6 %assign %%i xmm_regs_used %rep (xmm_regs_used-6) @@ -304,14 +308,14 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %endif %endmacro -%macro RESTORE_XMM 1 - RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 %assign xmm_regs_used 0 %endmacro %macro RET 0 - RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL rsp %if regs_used > 4 pop r5 pop r4 @@ -428,6 +432,13 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endif ;====================================================================== +%ifndef WIN64 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + ;============================================================================= @@ -494,7 +505,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define mova movq %define movu movq %define movh movd - %define movnt movntq + %define movnta movntq %assign %%i 0 %rep 8 CAT_XDEFINE m, %%i, mm %+ %%i @@ -518,7 +529,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define mova movdqa %define movu movdqu %define movh movq - %define movnt movntdq + %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, xmm %+ %%i diff --git a/encoder/analyse.c b/encoder/analyse.c index 93f7eed..ca6a7f0 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1124,8 +1124,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \ - (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ - (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight 
= weight_none; \ (m)->i_ref = ref; @@ -1500,17 +1499,17 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, ALIGNED_ARRAY_8( pixel, pix1,[16*8] ); pixel *pix2 = pix1+8; const int i_stride = h->mb.pic.i_stride[1]; - const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride; + const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride; const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE; const int i_ref = a->l0.me8x8[i8x8].i_ref; const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; + // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ - h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[1].weightfn ) \ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ - h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ if( weight[2].weightfn ) \ weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); diff --git a/encoder/encoder.c b/encoder/encoder.c index f7e0e38..d1807c8 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -68,13 +68,17 @@ static void x264_frame_dump( x264_t *h ) FILE *f = fopen( h->param.psz_dump_yuv, "r+b" ); if( !f ) return; - int bytes_per_pixel = (BIT_DEPTH+7)/8; /* Write the frame in display order */ - fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * bytes_per_pixel, SEEK_SET ); - for( int i = 0; i < h->fdec->i_plane; i++ ) - for( int y = 0; y < h->param.i_height >> !!i; y++ ) - for( int j = 0; j < h->param.i_width >> !!i; j++ ) - fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]]+j, bytes_per_pixel, 1, f ); + fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * sizeof(pixel), SEEK_SET ); + for( int y = 0; y < h->param.i_height; y++ ) + fwrite( &h->fdec->plane[0][y*h->fdec->i_stride[0]], sizeof(pixel), h->param.i_width, f ); + int cw = h->param.i_width>>1; + int ch = h->param.i_height>>1; + pixel *planeu = x264_malloc( cw*ch*2*sizeof(pixel) ); + pixel *planev = planeu + cw*ch; + h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); + fwrite( planeu, 1, cw*ch*2*sizeof(pixel), f ); + x264_free( planeu ); fclose( f ); } @@ -403,9 +407,9 @@ static int x264_validate_parameters( x264_t *h ) return -1; } int i_csp = h->param.i_csp & X264_CSP_MASK; - if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) + if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 && i_csp != X264_CSP_NV12 ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12 supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12 supported)\n" ); return -1; } @@ -1630,15 +1634,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) if( min_y < h->i_threadslice_start ) return; - if( !b_end && b_inloop ) - for( int j = 0; j <= h->sh.b_mbaff; j++ ) - for( int i = 0; i < 3; i++ ) - { - memcpy( h->intra_border_backup[j][i], - h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i], - (h->mb.i_mb_width*16 >> !!i) * sizeof(pixel) ); - } - if( b_deblock ) for( int y = min_y; y < max_y; y += (1 << 
h->sh.b_mbaff) ) x264_frame_deblock_row( h, y ); @@ -1663,12 +1658,19 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) if( b_measure_quality ) { if( h->param.analyse.b_psnr ) - for( int i = 0; i < 3; i++ ) - h->stat.frame.i_ssd[i] += - x264_pixel_ssd_wxh( &h->pixf, - h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i], - h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i], - h->param.i_width >> !!i, (max_y-min_y) >> !!i ); + { + uint64_t ssd_y = x264_pixel_ssd_wxh( &h->pixf, + h->fdec->plane[0] + min_y * h->fdec->i_stride[0], h->fdec->i_stride[0], + h->fenc->plane[0] + min_y * h->fenc->i_stride[0], h->fenc->i_stride[0], + h->param.i_width, max_y-min_y ); + uint64_t ssd_uv = x264_pixel_ssd_nv12( &h->pixf, + h->fdec->plane[1] + (min_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1], + h->fenc->plane[1] + (min_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1], + h->param.i_width>>1, (max_y-min_y)>>1 ); + h->stat.frame.i_ssd[0] += ssd_y; + h->stat.frame.i_ssd[1] += (uint32_t)ssd_uv; + h->stat.frame.i_ssd[2] += ssd_uv>>32; + } if( h->param.analyse.b_ssim ) { @@ -2715,8 +2717,9 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, if( pic_out->i_pts < pic_out->i_dts ) x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" ); + pic_out->img.i_csp = X264_CSP_NV12; pic_out->img.i_plane = h->fdec->i_plane; - for( int i = 0; i < 3; i++ ) + for( int i = 0; i < 2; i++ ) { pic_out->img.i_stride[i] = h->fdec->i_stride[i]; // FIXME This breaks the API when pixel != uint8_t. diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 557aa55..99cb433 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -492,25 +492,16 @@ static void x264_macroblock_encode_pskip( x264_t *h ) /* Special case for mv0, which is (of course) very common in P-skip mode. */ if( mvx | mvy ) - { - h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE, + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], mvx, mvy, 8, 8 ); - h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], - mvx, mvy, 8, 8 ); - } else - { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], 8 ); - } + h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] ); if( h->sh.weight[0][1].weightfn ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &h->sh.weight[0][1], 8 ); - if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, @@ -524,25 +515,21 @@ static void x264_macroblock_encode_pskip( x264_t *h ) * Intra prediction for predictive lossless mode. *****************************************************************************/ -/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes - * that the edge pixels of the reconstructed frame are the same as that of the source frame. This means - * they will only work correctly if the neighboring blocks are losslessly coded. In practice, this means - * lossless mode cannot be mixed with lossy mode within a frame. 
*/ -/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't - * need to be done unless we decide to allow mixing lossless and lossy compression. */ - void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode ) { - int stride = h->fenc->i_stride[1] << h->mb.b_interlaced; if( i_mode == I_PRED_CHROMA_V ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 ); + memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) ); + memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) ); } else if( i_mode == I_PRED_CHROMA_H ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 ); + x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 ); + x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 ); } else { @@ -965,28 +952,26 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) i_qp = h->mb.i_chroma_qp; thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + if( !b_bidir ) + { + /* Special case for mv0, which is (of course) very common in P-skip mode. */ + if( M32( mvp ) ) + h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, + h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], + mvp[0], mvp[1], 8, 8 ); + else + h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] ); + } + for( int ch = 0; ch < 2; ch++ ) { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; - if( !b_bidir ) - { - /* Special case for mv0, which is (of course) very common in P-skip mode. */ - if( M32( mvp ) ) - { - h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], - mvp[0], mvp[1], 8, 8 ); - } - else - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], 8 ); - - if( h->sh.weight[0][1+ch].weightfn ) - h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, - &h->sh.weight[0][1+ch], 8 ); - } + if( !b_bidir && h->sh.weight[0][1+ch].weightfn ) + h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, + &h->sh.weight[0][1+ch], 8 ); /* there is almost never a termination during chroma, but we can't avoid the check entirely */ /* so instead we check SSD and skip the actual check if the score is low enough. 
*/ diff --git a/encoder/me.c b/encoder/me.c index 19c5b2b..502246e 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -789,18 +789,17 @@ if( b_refine_qpel || (dir^1) != odir ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ if( b_chroma_me && cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \ &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ - h->mc.mc_chroma( pix, 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \ + m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \ &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix, 8 ); \ + cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \ @@ -923,10 +922,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite stride[list][i] = bw;\ src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\ if( rd )\ - {\ - h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ - }\ + h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ } #define SATD_THRESH 17/16 @@ -1111,10 +1107,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei uint64_t cost; \ M32( cache_mv ) = pack16to32_mask(mx,my); \ if( m->i_pixel <= PIXEL_8x8 ) \ - { \ - h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ - h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ - } \ + h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index a2c4cfe..2b8d595 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -204,22 +204,32 @@ static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale ) + rce->misc_bits; } +static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i ) +{ + uint32_t sum = sum_ssd; + uint32_t ssd = sum_ssd >> 32; + frame->i_pixel_sum[i] += sum; + frame->i_pixel_ssd[i] += ssd; + return ssd - ((uint64_t)sum * sum >> shift); +} + static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i ) { int w = i ? 8 : 16; - int shift = i ? 6 : 8; int stride = frame->i_stride[i]; int offset = h->mb.b_interlaced - ? 
w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride - : w * (mb_x + mb_y * stride); - int pix = i ? PIXEL_8x8 : PIXEL_16x16; + ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride + : 16 * mb_x + w * mb_y * stride; stride <<= h->mb.b_interlaced; - uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride ); - uint32_t sum = (uint32_t)res; - uint32_t ssd = res >> 32; - frame->i_pixel_sum[i] += sum; - frame->i_pixel_ssd[i] += ssd; - return ssd - ((uint64_t)sum * sum >> shift); + if( i ) + { + ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] ); + h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride ); + return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, i ) + + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, i ); + } + else + return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, i ); } // Find the total AC energy of the block in all planes. @@ -231,7 +241,6 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_ * sure no reordering goes on. */ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 ); var += ac_energy_plane( h, mb_x, mb_y, frame, 1 ); - var += ac_energy_plane( h, mb_x, mb_y, frame, 2 ); x264_emms(); return var; } diff --git a/tools/checkasm.c b/tools/checkasm.c index a5ffa17..42aa3b7 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -444,6 +444,25 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 ); report( "intra sad_x3 :" ); + ok = 1; used_asm = 0; + if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core ) + { + used_asm = 1; + set_func_name( "ssd_nv12" ); + uint64_t res_c = pixel_c.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8 ); + uint64_t res_a = pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8 ); + if( res_c != res_a ) + { + ok = 0; + fprintf( stderr, "ssd_nv12: %u,%u != %u,%u\n", + (uint32_t)res_c, (uint32_t)(res_c>>32), + (uint32_t)res_a, (uint32_t)(res_a>>32) ); + } + call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8 ); + call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8 ); + } + report( "ssd_nv12 :" ); + if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) { @@ -822,12 +841,15 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ for( int i = 0; i < 1024; i++ ) \ pbuf3[i] = pbuf4[i] = 0xCD; \ - call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \ - call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \ + call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \ + call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. 
*/ \ for( int j = 0; j < h; j++ ) \ - for( int i = w; i < 4; i++ ) \ + for( int i = w; i < 8; i++ ) \ + { \ + dst2[i+j*16+8] = dst1[i+j*16+8]; \ dst2[i+j*16] = dst1[i+j*16]; \ + } \ if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ @@ -969,6 +991,121 @@ static int check_mc( int cpu_ref, int cpu_new ) } report( "mc offsetsub :" ); + ok = 1; used_asm = 0; + if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 ) + { + set_func_name( "store_interleave_8x8x2" ); + used_asm = 1; + memset( pbuf3, 0, 64*8 ); + memset( pbuf4, 0, 64*8 ); + call_c( mc_c.store_interleave_8x8x2, pbuf3, 64, pbuf1, pbuf1+16 ); + call_a( mc_a.store_interleave_8x8x2, pbuf4, 64, pbuf1, pbuf1+16 ); + if( memcmp( pbuf3, pbuf4, 64*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc ) + { + set_func_name( "load_deinterleave_8x8x2_fenc" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_8x8x2_fenc, pbuf3, pbuf1, 64 ); + call_a( mc_a.load_deinterleave_8x8x2_fenc, pbuf4, pbuf1, 64 ); + if( memcmp( pbuf3, pbuf4, FENC_STRIDE*8 ) ) + ok = 0; + } + if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec ) + { + set_func_name( "load_deinterleave_8x8x2_fdec" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_8x8x2_fdec, pbuf3, pbuf1, 64 ); + call_a( mc_a.load_deinterleave_8x8x2_fdec, pbuf4, pbuf1, 64 ); + if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*8 ) ) + ok = 0; + } + report( "store_interleave :" ); + + struct plane_spec { + int w, h, src_stride; + } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} }; + ok = 1; used_asm = 0; + if( mc_a.plane_copy != mc_ref.plane_copy ) + { + set_func_name( "plane_copy" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = plane_specs[i].w; + int h = plane_specs[i].h; + int src_stride = plane_specs[i].src_stride; + int dst_stride = (w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src1 = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000*sizeof(pixel) ); + memset( pbuf4, 0, 0x1000*sizeof(pixel) ); + call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h ); + call_a( mc_a.plane_copy, pbuf4, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) + { + set_func_name( "plane_copy_interleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int src_stride = (plane_specs[i].src_stride + 1) >> 1; + int dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + uint8_t *src1 = buf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000*sizeof(pixel) ); + memset( pbuf4, 0, 0x1000*sizeof(pixel) ); + call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h ); + call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_interleave 
FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + + if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave ) + { + set_func_name( "plane_copy_deinterleave" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + int dst_stride = w; + int src_stride = (2*w + 127) & ~63; + int offv = (dst_stride*h + 31) & ~15; + memset( pbuf3, 0, 0x1000 ); + memset( pbuf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h ); + call_a( mc_a.plane_copy_deinterleave, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w ) || + memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride ); + break; + } + } + } + report( "plane_copy :" ); + if( mc_a.hpel_filter != mc_ref.hpel_filter ) { pixel *srchpel = pbuf1+8+2*64; diff --git a/x264.h b/x264.h index 4d9b9ca..f280927 100644 --- a/x264.h +++ b/x264.h @@ -181,7 +181,8 @@ static const char * const x264_open_gop_names[] = { "none", "normal", "bluray", #define X264_CSP_RGB 0x0006 /* rgb 24bits */ #define X264_CSP_BGR 0x0007 /* bgr 24bits */ #define X264_CSP_BGRA 0x0008 /* bgr 32bits */ -#define X264_CSP_MAX 0x0009 /* end of list */ +#define X264_CSP_NV12 0x0009 /* yuv 4:2:0, with one y plane and one packed u+v */ +#define X264_CSP_MAX 0x0010 /* end of list */ #define X264_CSP_VFLIP 0x1000 /* */ /* Slice type */ -- 1.7.0.4
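A note on the new X264_CSP_NV12 entry for anyone updating callers: in NV12 the two chroma components share one plane, stored as interleaved byte pairs, so both components for a given chroma position sit at consecutive offsets in plane[1]. A minimal sketch, with illustrative names rather than the exact x264 API:

/* Illustrative NV12 addressing: U and V for a given chroma position are
 * adjacent bytes in the single packed chroma plane. */
#include <stdint.h>

static inline void nv12_get_chroma( const uint8_t *uv_plane, int stride,
                                    int x, int y, uint8_t *u, uint8_t *v )
{
    const uint8_t *p = uv_plane + y*stride + 2*x;
    *u = p[0];
    *v = p[1];
}

This is why chroma x offsets double while strides stay per-plane in the hunks above (e.g. the `or` computation in analyse.c).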