Index: encoder/encoder.c
===================================================================
--- encoder/encoder.c	(revision 516)
+++ encoder/encoder.c	(working copy)
@@ -610,6 +610,10 @@
     for( i = 1; i < param->i_threads; i++ )
         h->thread[i] = x264_malloc( sizeof(x264_t) );
 
+#ifndef ARCH_X86_64
+    x264_log( h, X264_LOG_WARNING, "edge-detection is asm'ed only on x86_64 linux\n" );
+#endif
+
     return h;
 }
 
Index: encoder/analyse.c
===================================================================
--- encoder/analyse.c	(revision 516)
+++ encoder/analyse.c	(working copy)
@@ -82,6 +82,7 @@
     /* I: Intra part */
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
+    int b_edge_intra;
     int b_try_pskip;
 
     /* Luma part */
@@ -92,9 +93,11 @@
     int i_satd_i8x8;
     int i_satd_i8x8_dir[12][4];
     int i_predict8x8[4];
+    int i_edge_i8x8[4];
 
     int i_satd_i4x4;
     int i_predict4x4[16];
+    int i_edge_i4x4[16];
 
     /* Chroma part */
     int i_satd_i8x8chroma;
@@ -373,27 +376,49 @@
 
 /* MAX = 9 */
 static void predict_4x4_mode_available( unsigned int i_neighbour,
-                                        int *mode, int *pi_count )
+                                        int *mode, int *pi_count, int edge )
 {
+    /* for each directional mode, its two angular neighbors (tried alongside it) */
+    static const int dir_neighbors[][2] = {
+        [I_PRED_4x4_HU ]= { I_PRED_4x4_DDL, I_PRED_4x4_H },
+        [I_PRED_4x4_H  ]= { I_PRED_4x4_HU,  I_PRED_4x4_HD },
+        [I_PRED_4x4_HD ]= { I_PRED_4x4_H,   I_PRED_4x4_DDR },
+        [I_PRED_4x4_DDR]= { I_PRED_4x4_HD,  I_PRED_4x4_VR },
+        [I_PRED_4x4_VR ]= { I_PRED_4x4_DDR, I_PRED_4x4_V },
+        [I_PRED_4x4_V  ]= { I_PRED_4x4_VR,  I_PRED_4x4_VL },
+        [I_PRED_4x4_VL ]= { I_PRED_4x4_V,   I_PRED_4x4_DDL },
+        [I_PRED_4x4_DDL]= { I_PRED_4x4_VL,  I_PRED_4x4_HU }
+    };
     int b_l = i_neighbour & MB_LEFT;
     int b_t = i_neighbour & MB_TOP;
 
     if( b_l && b_t )
     {
-        *pi_count = 6;
-        *mode++ = I_PRED_4x4_DC;
-        *mode++ = I_PRED_4x4_H;
-        *mode++ = I_PRED_4x4_V;
-        *mode++ = I_PRED_4x4_DDL;
-        if( i_neighbour & MB_TOPLEFT )
+        int b_tl = i_neighbour & MB_TOPLEFT;
+        if( b_tl && edge >= 0 )
         {
-            *mode++ = I_PRED_4x4_DDR;
-            *mode++ = I_PRED_4x4_VR;
-            *mode++ = I_PRED_4x4_HD;
-            *pi_count += 3;
+            *pi_count = 4;
+            *mode++ = I_PRED_4x4_DC;
+            *mode++ = edge;
+            *mode++ = dir_neighbors[edge][0];
+            *mode++ = dir_neighbors[edge][1];
         }
-        *mode++ = I_PRED_4x4_VL;
-        *mode++ = I_PRED_4x4_HU;
+        else
+        {
+            *pi_count = 6;
+            *mode++ = I_PRED_4x4_DC;
+            *mode++ = I_PRED_4x4_H;
+            *mode++ = I_PRED_4x4_V;
+            *mode++ = I_PRED_4x4_DDL;
+            if( i_neighbour & MB_TOPLEFT )
+            {
+                *mode++ = I_PRED_4x4_DDR;
+                *mode++ = I_PRED_4x4_VR;
+                *mode++ = I_PRED_4x4_HD;
+                *pi_count += 3;
+            }
+            *mode++ = I_PRED_4x4_VL;
+            *mode++ = I_PRED_4x4_HU;
+        }
     }
     else if( b_l )
     {
@@ -493,6 +518,14 @@
     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
         return;
 
+    a->b_edge_intra = !a->b_mbrd && (flags & X264_ANALYSE_I4x4);
+    if( a->b_edge_intra )
+    {
+        int stride = h->fenc->i_stride[0];
+        uint8_t *fenc = h->fenc->plane[0] + 16*(h->mb.i_mb_x+h->mb.i_mb_y*stride);
+        h->pixf.edge_detect( fenc, stride, a->i_edge_i4x4, a->i_edge_i8x8 );
+    }
+
     /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
     {
@@ -513,7 +546,8 @@
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max,
+                                        a->b_edge_intra ? a->i_edge_i8x8[idx] : -1 );
             for( i = 0; i < i_max; i++ )
             {
                 int i_satd;
@@ -571,7 +605,8 @@
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max,
+                                        a->b_edge_intra ? a->i_edge_i4x4[idx] : -1 );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
@@ -681,7 +716,7 @@
             p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
             p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max, -1 );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
@@ -733,7 +768,7 @@
         p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
         p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 
-        predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+        predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max, -1 );
         for( i = 0; i < i_max; i++ )
         {
             i_mode = predict_mode[i];
Index: common/i386/pixel.h
===================================================================
--- common/i386/pixel.h	(revision 516)
+++ common/i386/pixel.h	(working copy)
@@ -90,4 +90,6 @@
 int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
 
+void x264_pixel_edge_detect_sse2( uint8_t *pix, int stride, int *res4, int *res8 );
+
 #endif
Index: common/pixel.c
===================================================================
--- common/pixel.c	(revision 516)
+++ common/pixel.c	(working copy)
@@ -322,6 +322,66 @@
 SAD_X( 8x8_vis )
 #endif
 
+
+/* C reference: dominant gradient direction per 4x4 (res4) and 8x8 (res8) block
+ * of a 16x16 macroblock, expressed as i4x4 prediction mode indices. */
+static void edge_detect( uint8_t *pix, int stride, int res4[16], int res8[4] )
+{
+#define TAN_1_16 13 // FIX6(0.198912), tan(pi*1/16)
+#define TAN_3_16 43 // FIX6(0.668179), tan(pi*3/16)
+    int x,y,i,idx;
+    int dir4[4][4][9] = {{{0}}};
+    int dir8[2][2][9] = {{{0}}};
+
+    for( y=0; y<16; y++ )
+        for( x=0; x<16; x++ )
+        {
+            uint8_t *p = &pix[x+y*stride];
+            int dx = p[0] - p[-1] + p[-stride] - p[-1-stride];
+            int dy = p[0] + p[-1] - p[-stride] - p[-1-stride];
+            int ax = abs(dx);
+            int ay = abs(dy);
+            int quadrant = (dx^dy) < 0;
+            int dir;
+
+            if( 64*ay < TAN_1_16*ax )
+                dir = I_PRED_4x4_V;
+            else if( 64*ax < TAN_1_16*ay )
+                dir = I_PRED_4x4_H;
+            else if( 64*ay < TAN_3_16*ax )
+                dir = quadrant ? I_PRED_4x4_VR : I_PRED_4x4_VL;
+            else if( 64*ax < TAN_3_16*ay )
+                dir = quadrant ? I_PRED_4x4_HD : I_PRED_4x4_HU;
+            else
+                dir = quadrant ? I_PRED_4x4_DDR : I_PRED_4x4_DDL;
+
+            dir4[y>>2][x>>2][dir] += ax+ay;
+            dir8[y>>3][x>>3][dir] += ax+ay;
+        }
+
+    for( idx=0; idx<16; idx++ )
+    {
+        int edge = -1, amp = -1;
+        for( i=0; i<9; i++ )
+            if( dir4[0][idx][i] > amp ) /* flat-indexes the [4][4] grid */
+            {
+                edge = i;
+                amp = dir4[0][idx][i];
+            }
+        res4[idx] = edge;
+    }
+    for( idx=0; idx<4; idx++ )
+    {
+        int edge = -1, amp = -1;
+        for( i=0; i<9; i++ )
+            if( dir8[0][idx][i] > amp ) /* flat-indexes the [2][2] grid */
+            {
+                edge = i;
+                amp = dir8[0][idx][i];
+            }
+        res8[idx] = edge;
+    }
+}
+
+
 /****************************************************************************
  * x264_pixel_init:
 ****************************************************************************/
@@ -349,6 +409,8 @@
     pixf->sa8d[PIXEL_8x16]  = x264_pixel_sa8d_8x16;
     pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
 
+    pixf->edge_detect = edge_detect;
+
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMX )
     {
@@ -403,6 +465,8 @@
 #ifdef ARCH_X86_64
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
+
+        pixf->edge_detect = x264_pixel_edge_detect_sse2;
 #endif
     }
 #endif
Index: common/pixel.h
===================================================================
--- common/pixel.h	(revision 516)
+++ common/pixel.h	(working copy)
@@ -78,6 +78,8 @@
     /* multiple parallel calls to sad. */
     x264_pixel_cmp_x3_t sad_x3[7];
     x264_pixel_cmp_x4_t sad_x4[7];
+
+    void (*edge_detect)( uint8_t *pix, int stride, int *res4, int *res8 );
 } x264_pixel_function_t;
 
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
Index: common/amd64/pixel-sse2.asm
===================================================================
--- common/amd64/pixel-sse2.asm	(revision 516)
+++ common/amd64/pixel-sse2.asm	(working copy)
@@ -30,8 +30,33 @@
 
 SECTION .rodata align=16
 
+pw_000d: times 8 dw 0x000d
+pw_002b: times 8 dw 0x002b
 pd_0000ffff: times 4 dd 0x0000ffff
+dw_0624:
+%rep 2
+    dw 0
+    dw 6
+    dw 2
+    dw 4
+%endrep
+dw_3715:
+%rep 2
+    dw 3
+    dw 7
+    dw 1
+    dw 5
+%endrep
+edge_map:
+    dd 8 ; HU
+    dd 7 ; VL
+    dd 6 ; HD
+    dd 5 ; VR
+    dd 4 ; DDR
+    dd 3 ; DDL
+    dd 1 ; H
+    dd 0 ; V
 
 SECTION .text
 
@@ -47,6 +72,7 @@
 cglobal x264_pixel_satd_16x16_sse2
 cglobal x264_pixel_sa8d_8x8_sse2
 cglobal x264_pixel_sa8d_16x16_sse2
+cglobal x264_pixel_edge_detect_sse2
 
 %macro SAD_INC_4x16P_SSE2 0
     movdqu  xmm1, [rdx]
@@ -615,3 +641,247 @@
     add  eax, 1
     shr  eax, 1
     ret
+
+
+
+%macro HADDW 3 ; dst, tmp, mem
+    movdqa  %1, %3
+    pshufd  %2, %1, 10110001b
+    paddw   %1, %2
+    movdqa  %2, %1
+    psrld   %2, 16
+    paddw   %1, %2
+%endmacro
+
+%macro EDGE_REMAP 2
+    mov  eax, %1/2
+.remap%1:
+    mov  edi, [%2 + 8*rax - 8]
+    mov  esi, [%2 + 8*rax - 4]
+    and  edi, 7
+    and  esi, 7
+    mov  edi, [edge_map + 4*rdi GLOBAL]
+    mov  esi, [edge_map + 4*rsi GLOBAL]
+    mov  [%2 + 8*rax - 8], edi
+    mov  [%2 + 8*rax - 4], esi
+    dec  eax
+    jnz  .remap%1
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_pixel_edge_detect_sse2( uint8_t *pix, int stride, int *res4, int *res8 )
+;-----------------------------------------------------------------------------
+x264_pixel_edge_detect_sse2:
+    push rbp
+    mov  rbp, rsp
+    sub  rsp, 0x80
+    and  rsp, -16
+    sub  parm1q, parm2q
+%define amp_sum rsp
+%define i8_sum  xmm11
+    ; { HU, H, HD, DDR, VR, V, VL, DDL }
+    ;    0, 6,  2,   4,  3, 7,  1,   5
+    pxor   xmm15, xmm15
+    movdqa xmm14, [pw_000d GLOBAL] ; 13 = 64*tan(pi*1/16)
+    movdqa xmm13, [pw_002b GLOBAL] ; 43 = 64*tan(pi*3/16)
+    movdqu xmm12, [dw_0624+8 GLOBAL]
+    pxor   i8_sum, i8_sum
+
+    mov  r10d, 2
+.loopx:
+    mov  r11d, 4
+.loopy:
+    movdqa [amp_sum+0x00], xmm15
+    movdqa [amp_sum+0x10], xmm15
+    movdqa [amp_sum+0x20], xmm15
+    movdqa [amp_sum+0x30], xmm15
+    movdqa [amp_sum+0x40], xmm15
+    movdqa [amp_sum+0x50], xmm15
+    movdqa [amp_sum+0x60], xmm15
+    movdqa [amp_sum+0x70], xmm15
+    mov  eax, 4
+.loop4x4:
+
+    ; calculate gradient
+    movq      xmm0, [parm1q+parm2q]
+    movq      xmm1, [parm1q+parm2q-1]
+    movq      xmm2, [parm1q]
+    movq      xmm3, [parm1q-1]
+    punpcklbw xmm0, xmm15
+    punpcklbw xmm1, xmm15
+    punpcklbw xmm2, xmm15
+    punpcklbw xmm3, xmm15
+    psubw     xmm0, xmm3
+    psubw     xmm1, xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1 ; a+b-c-d
+    psubw     xmm1, xmm2 ; b+d-a-c
+    movdqa    xmm9, xmm0
+    pxor      xmm9, xmm1
+    psraw     xmm9, 15 ; quadrant
+    pxor      xmm2, xmm2
+    pxor      xmm3, xmm3
+    psubw     xmm2, xmm0
+    psubw     xmm3, xmm1
+    pmaxsw    xmm0, xmm2 ; abs(dI/dy)
+    pmaxsw    xmm1, xmm3 ; abs(dI/dx)
+    movdqa    xmm8, xmm0
+    paddw     xmm8, xmm1 ; amp
+    pcmpeqb   xmm7, xmm7 ; ifelse mask
+
+    ; classify gradient into one of 8 directions,
+    ; and sum the magnitudes of each direction
+    ; 9.6 fixed-point math
+    movdqa    xmm2, xmm1
+    movdqa    xmm3, xmm0
+    movdqa    xmm4, xmm1
+    movdqa    xmm5, xmm0
+    pmullw    xmm2, xmm14
+    pmullw    xmm3, xmm14
+    pmullw    xmm4, xmm13
+    pmullw    xmm5, xmm13
+    psllw     xmm0, 6
+    psllw     xmm1, 6
+    pcmpgtw   xmm2, xmm0 ; ay < ax*tan(pi*1/16)
+    pcmpgtw   xmm3, xmm1 ; ax < ay*tan(pi*1/16)
+    pcmpgtw   xmm4, xmm0 ; ay < ax*tan(pi*3/16)
+    pcmpgtw   xmm5, xmm1 ; ax < ay*tan(pi*3/16)
+
+    pxor      xmm7, xmm2
+    pand      xmm2, xmm8
+    paddw     xmm2, [amp_sum+0x50]
+    movdqa    [amp_sum+0x50], xmm2 ; V
+
+    pand      xmm3, xmm7
+    pxor      xmm7, xmm3
+    pand      xmm3, xmm8
+    paddw     xmm3, [amp_sum+0x10]
+    movdqa    [amp_sum+0x10], xmm3 ; H
+
+    pand      xmm4, xmm7
+    pxor      xmm7, xmm4
+    pand      xmm4, xmm8
+    movdqa    xmm0, xmm9
+    pandn     xmm0, xmm4
+    pand      xmm4, xmm9
+    paddw     xmm4, [amp_sum+0x60]
+    paddw     xmm0, [amp_sum+0x40]
+    movdqa    [amp_sum+0x60], xmm4 ; VL
+    movdqa    [amp_sum+0x40], xmm0 ; VR
+
+    pand      xmm5, xmm7
+    pxor      xmm7, xmm5
+    pand      xmm5, xmm8
+    movdqa    xmm0, xmm9
+    pandn     xmm0, xmm5
+    pand      xmm5, xmm9
+    paddw     xmm5, [amp_sum+0x00]
+    paddw     xmm0, [amp_sum+0x20]
+    movdqa    [amp_sum+0x00], xmm5 ; HU
+    movdqa    [amp_sum+0x20], xmm0 ; HD
+
+    pand      xmm7, xmm8
+    pand      xmm9, xmm7
+    pxor      xmm7, xmm9
+    paddw     xmm9, [amp_sum+0x70]
+    paddw     xmm7, [amp_sum+0x30]
+    movdqa    [amp_sum+0x70], xmm9 ; DDL
+    movdqa    [amp_sum+0x30], xmm7 ; DDR
+
+    add  parm1q, parm2q
+    dec  eax
+    jnz  .loop4x4
+
+    ; find which directions were strongest for these two 4x4 blocks
+    HADDW xmm0, xmm8, [amp_sum+0x00]
+    HADDW xmm1, xmm8, [amp_sum+0x10]
+    HADDW xmm2, xmm8, [amp_sum+0x20]
+    HADDW xmm3, xmm8, [amp_sum+0x30]
+    HADDW xmm4, xmm8, [amp_sum+0x40]
+    HADDW xmm5, xmm8, [amp_sum+0x50]
+    HADDW xmm6, xmm8, [amp_sum+0x60]
+    HADDW xmm7, xmm8, [amp_sum+0x70]
+
+    psllq xmm3, 48+3
+    psllq xmm7, 48+3
+    psllq xmm2, 48+3
+    psllq xmm6, 48+3
+    psrlq xmm2, 16
+    psrlq xmm6, 16
+    psllq xmm1, 48+3
+    psllq xmm5, 48+3
+    psrlq xmm1, 32
+    psrlq xmm5, 32
+    psllq xmm0, 48+3
+    psllq xmm4, 48+3
+    psrlq xmm0, 48
+    psrlq xmm4, 48
+    por   xmm0, xmm1
+    por   xmm4, xmm5
+    por   xmm0, xmm2
+    por   xmm4, xmm6
+    por   xmm0, xmm3
+    por   xmm4, xmm7
+
+    pshufd xmm1, xmm0, 01001110b
+    pshufd xmm2, xmm4, 01001110b
+    paddsw xmm1, xmm0
+    paddsw xmm2, xmm4
+    psrldq xmm1, 8
+    pslldq xmm2, 8
+    por    xmm1, xmm2
+    paddsw i8_sum, xmm1
+
+    por    xmm0, [dw_0624 GLOBAL]
+    por    xmm4, [dw_3715 GLOBAL]
+    pmaxsw xmm0, xmm4
+    pshufd xmm1, xmm0, 10110001b
+    pmaxsw xmm0, xmm1
+    movdqa xmm1, xmm0
+    psrld  xmm1, 16
+    pmaxsw xmm0, xmm1
+    movd   [parm3q+0], xmm0
+    psrldq xmm0, 8
+    movd   [parm3q+4], xmm0
+
+    test r11d, 1
+    jz .oddy
+    ; find which direction was strongest for this 8x8 block
+    ; FIXME can occasionally require up to 14 bits for the sums, but only has 12
+    ; if multiple directions saturate, we won't know which is strongest
+    psrlw   i8_sum, 3
+    psllw   i8_sum, 3
+    por     i8_sum, xmm12
+    pshufd  xmm1, i8_sum, 1110b
+    pmaxsw  i8_sum, xmm1
+    pshuflw xmm1, i8_sum, 1110b
+    pmaxsw  i8_sum, xmm1
+    pshuflw xmm1, i8_sum, 1
+    pmaxsw  i8_sum, xmm1
+    movd    [parm4q], i8_sum
+    pxor    i8_sum, i8_sum
+    add     parm4q, 8
+.oddy:
+
+    add  parm3q, 16
+    dec  r11d
+    jnz  .loopy
+
+    sub  parm3q, 56
+    sub  parm4q, 12
+    mov  rax, parm2q
+    shl  rax, 4
+    sub  parm1q, rax
+    add  parm1q, 8
+    dec  r10d
+    jnz  .loopx
+
+    EDGE_REMAP 16, (parm3q - 16)
+    EDGE_REMAP 4,  (parm4q - 8)
+
+    mov  rsp, rbp
+    pop  rbp
+    ret
+%undef i8_sum
+%undef amp_sum
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c	(revision 516)
+++ tools/checkasm.c	(working copy)
@@ -92,6 +92,24 @@
     TEST_PIXEL_X(3);
     TEST_PIXEL_X(4);
 
+    ok = 1; used_asm = 0;
+    {
+        int res4_c[16], res8_c[4], res4_a[16], res8_a[4];
+        if( pixel_asm.edge_detect != pixel_ref.edge_detect )
+        {
+            used_asm = 1;
+            pixel_c.edge_detect( buf1, 16, res4_c, res8_c );
+            pixel_asm.edge_detect( buf1, 16, res4_a, res8_a );
+            if( memcmp(res4_c, res4_a, sizeof(res4_c)) ||
+                memcmp(res8_c, res8_a, sizeof(res8_c)) )
+            {
+                ok = 0;
+            }
+        }
+    }
+    report( "edge_detect :" );
+
     return ret;
 }
 