Index: encoder/encoder.c
===================================================================
--- encoder/encoder.c	(revision 516)
+++ encoder/encoder.c	(working copy)
@@ -610,6 +610,10 @@
     for( i = 1; i < param->i_threads; i++ )
         h->thread[i] = x264_malloc( sizeof(x264_t) );
 
+#ifndef ARCH_X86_64
+    x264_log( h, X264_LOG_WARNING, "edge-detection is asm'ed only on x86_64 linux\n" );
+#endif
+
     return h;
 }
 
Index: encoder/analyse.c
===================================================================
--- encoder/analyse.c	(revision 516)
+++ encoder/analyse.c	(working copy)
@@ -82,6 +82,7 @@
     /* I: Intra part */
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
+    int b_edge_intra;
     int b_try_pskip;
 
     /* Luma part */
@@ -92,9 +93,11 @@
     int i_satd_i8x8;
     int i_satd_i8x8_dir[12][4];
     int i_predict8x8[4];
+    int i_edge_i8x8[4];
 
     int i_satd_i4x4;
     int i_predict4x4[16];
+    int i_edge_i4x4[16];
 
     /* Chroma part */
     int i_satd_i8x8chroma;
@@ -373,27 +376,49 @@
 
 /* MAX = 9 */
 static void predict_4x4_mode_available( unsigned int i_neighbour,
-                                        int *mode, int *pi_count )
+                                        int *mode, int *pi_count, int edge )
 {
+    /* for each directional mode, its two angular neighbors (tried alongside it) */
+    static const int dir_neighbors[][2] = {
+        [I_PRED_4x4_HU ]= { I_PRED_4x4_DDL, I_PRED_4x4_H },
+        [I_PRED_4x4_H  ]= { I_PRED_4x4_HU,  I_PRED_4x4_HD },
+        [I_PRED_4x4_HD ]= { I_PRED_4x4_H,   I_PRED_4x4_DDR },
+        [I_PRED_4x4_DDR]= { I_PRED_4x4_HD,  I_PRED_4x4_VR },
+        [I_PRED_4x4_VR ]= { I_PRED_4x4_DDR, I_PRED_4x4_V },
+        [I_PRED_4x4_V  ]= { I_PRED_4x4_VR,  I_PRED_4x4_VL },
+        [I_PRED_4x4_VL ]= { I_PRED_4x4_V,   I_PRED_4x4_DDL },
+        [I_PRED_4x4_DDL]= { I_PRED_4x4_VL,  I_PRED_4x4_HU }
+    };
     int b_l = i_neighbour & MB_LEFT;
     int b_t = i_neighbour & MB_TOP;
 
     if( b_l && b_t )
     {
-        *pi_count = 6;
-        *mode++ = I_PRED_4x4_DC;
-        *mode++ = I_PRED_4x4_H;
-        *mode++ = I_PRED_4x4_V;
-        *mode++ = I_PRED_4x4_DDL;
-        if( i_neighbour & MB_TOPLEFT )
+        int b_tl = i_neighbour & MB_TOPLEFT;
+        if( b_tl && edge >= 0 )
         {
-            *mode++ = I_PRED_4x4_DDR;
-            *mode++ = I_PRED_4x4_VR;
-            *mode++ = I_PRED_4x4_HD;
-            *pi_count += 3;
+            *pi_count = 4;
+            *mode++ = I_PRED_4x4_DC;
+            *mode++ = edge;
+            *mode++ = dir_neighbors[edge][0];
+            *mode++ = dir_neighbors[edge][1];
         }
-        *mode++ = I_PRED_4x4_VL;
-        *mode++ = I_PRED_4x4_HU;
+        else
+        {
+            *pi_count = 6;
+            *mode++ = I_PRED_4x4_DC;
+            *mode++ = I_PRED_4x4_H;
+            *mode++ = I_PRED_4x4_V;
+            *mode++ = I_PRED_4x4_DDL;
+            if( i_neighbour & MB_TOPLEFT )
+            {
+                *mode++ = I_PRED_4x4_DDR;
+                *mode++ = I_PRED_4x4_VR;
+                *mode++ = I_PRED_4x4_HD;
+                *pi_count += 3;
+            }
+            *mode++ = I_PRED_4x4_VL;
+            *mode++ = I_PRED_4x4_HU;
+        }
     }
     else if( b_l )
     {
@@ -493,6 +518,14 @@
     if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
         return;
 
+    a->b_edge_intra = !a->b_mbrd && (flags & X264_ANALYSE_I4x4);
+    if( a->b_edge_intra )
+    {
+        int stride = h->fenc->i_stride[0];
+        uint8_t *fenc = h->fenc->plane[0] + 16*(h->mb.i_mb_x+h->mb.i_mb_y*stride);
+        h->pixf.edge_detect( fenc, stride, a->i_edge_i4x4, a->i_edge_i8x8 );
+    }
+
     /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
     {
@@ -513,7 +546,8 @@
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max,
+                                        a->b_edge_intra ? a->i_edge_i8x8[idx] : -1 );
             for( i = 0; i < i_max; i++ )
             {
                 int i_satd;
@@ -571,7 +605,8 @@
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max,
+                                        a->b_edge_intra ? a->i_edge_i4x4[idx] : -1 );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
@@ -681,7 +716,7 @@
             p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
             p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max, -1 );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
@@ -733,7 +768,7 @@
         p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
         p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
 
-        predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+        predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max, -1 );
         for( i = 0; i < i_max; i++ )
         {
             i_mode = predict_mode[i];
Index: common/i386/pixel.h
===================================================================
--- common/i386/pixel.h	(revision 516)
+++ common/i386/pixel.h	(working copy)
@@ -90,4 +90,6 @@
 int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
 
+void x264_pixel_edge_detect_sse2( uint8_t *pix, int stride, int *res4, int *res8 );
+
 #endif
Index: common/pixel.c
===================================================================
--- common/pixel.c	(revision 516)
+++ common/pixel.c	(working copy)
@@ -322,6 +322,66 @@
 SAD_X( 8x8_vis )
 #endif
 
+
+/* C reference: dominant gradient direction per 4x4 (res4) and 8x8 (res8) block
+ * of a 16x16 macroblock, expressed as i4x4 prediction mode indices. */
+static void edge_detect( uint8_t *pix, int stride, int res4[16], int res8[4] )
+{
+#define TAN_1_16 13 // FIX6(0.198912), tan(pi*1/16)
+#define TAN_3_16 43 // FIX6(0.668179), tan(pi*3/16)
+    int x,y,i,idx;
+    int dir4[4][4][9] = {{{0}}};
+    int dir8[2][2][9] = {{{0}}};
+
+    for( y=0; y<16; y++ )
+        for( x=0; x<16; x++ )
+        {
+            uint8_t *p = &pix[x+y*stride];
+            int dx = p[0] - p[-1] + p[-stride] - p[-1-stride];
+            int dy = p[0] + p[-1] - p[-stride] - p[-1-stride];
+            int ax = abs(dx);
+            int ay = abs(dy);
+            int quadrant = (dx^dy) < 0;
+            int dir;
+
+            if( 64*ay < TAN_1_16*ax )
+                dir = I_PRED_4x4_V;
+            else if( 64*ax < TAN_1_16*ay )
+                dir = I_PRED_4x4_H;
+            else if( 64*ay < TAN_3_16*ax )
+                dir = quadrant ? I_PRED_4x4_VR : I_PRED_4x4_VL;
+            else if( 64*ax < TAN_3_16*ay )
+                dir = quadrant ? I_PRED_4x4_HD : I_PRED_4x4_HU;
+            else
+                dir = quadrant ? I_PRED_4x4_DDR : I_PRED_4x4_DDL;
+
+            dir4[y>>2][x>>2][dir] += ax+ay;
+            dir8[y>>3][x>>3][dir] += ax+ay;
+        }
+
+    for( idx=0; idx<16; idx++ )
+    {
+        int edge = -1, amp = -1;
+        for( i=0; i<9; i++ )
+            if( dir4[0][idx][i] > amp ) /* flat-indexes the [4][4] grid */
+            {
+                edge = i;
+                amp = dir4[0][idx][i];
+            }
+        res4[idx] = edge;
+    }
+    for( idx=0; idx<4; idx++ )
+    {
+        int edge = -1, amp = -1;
+        for( i=0; i<9; i++ )
+            if( dir8[0][idx][i] > amp ) /* flat-indexes the [2][2] grid */
+            {
+                edge = i;
+                amp = dir8[0][idx][i];
+            }
+        res8[idx] = edge;
+    }
+}
+
+
 /****************************************************************************
  * x264_pixel_init:
 ****************************************************************************/
@@ -349,6 +409,8 @@
     pixf->sa8d[PIXEL_8x16]  = x264_pixel_sa8d_8x16;
     pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
 
+    pixf->edge_detect = edge_detect;
+
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMX )
     {
@@ -403,6 +465,8 @@
 #ifdef ARCH_X86_64
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
+
+        pixf->edge_detect = x264_pixel_edge_detect_sse2;
 #endif
     }
 #endif
Index: common/pixel.h
===================================================================
--- common/pixel.h	(revision 516)
+++ common/pixel.h	(working copy)
@@ -78,6 +78,8 @@
     /* multiple parallel calls to sad. */
     x264_pixel_cmp_x3_t sad_x3[7];
     x264_pixel_cmp_x4_t sad_x4[7];
+
+    void (*edge_detect)( uint8_t *pix, int stride, int *res4, int *res8 );
 } x264_pixel_function_t;
 
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
Index: common/amd64/pixel-sse2.asm
===================================================================
--- common/amd64/pixel-sse2.asm	(revision 516)
+++ common/amd64/pixel-sse2.asm	(working copy)
@@ -30,8 +30,33 @@
 
 SECTION .rodata align=16
 
+pw_000d: times 8 dw 0x000d
+pw_002b: times 8 dw 0x002b
 pd_0000ffff: times 4 dd 0x0000ffff
+dw_0624:
+%rep 2
+    dw 0
+    dw 6
+    dw 2
+    dw 4
+%endrep
+dw_3715:
+%rep 2
+    dw 3
+    dw 7
+    dw 1
+    dw 5
+%endrep
+edge_map:
+    dd 8 ; HU
+    dd 7 ; VL
+    dd 6 ; HD
+    dd 5 ; VR
+    dd 4 ; DDR
+    dd 3 ; DDL
+    dd 1 ; H
+    dd 0 ; V
 
 SECTION .text
 
@@ -47,6 +72,7 @@
 cglobal x264_pixel_satd_16x16_sse2
 cglobal x264_pixel_sa8d_8x8_sse2
 cglobal x264_pixel_sa8d_16x16_sse2
+cglobal x264_pixel_edge_detect_sse2
 
 %macro SAD_INC_4x16P_SSE2 0
     movdqu  xmm1, [rdx]
@@ -615,3 +641,247 @@
     add  eax, 1
     shr  eax, 1
     ret
+
+
+
+%macro HADDW 3 ; dst, tmp, mem
+    movdqa  %1, %3
+    pshufd  %2, %1, 10110001b
+    paddw   %1, %2
+    movdqa  %2, %1
+    psrld   %2, 16
+    paddw   %1, %2
+%endmacro
+
+%macro EDGE_REMAP 2
+    mov  eax, %1/2
+.remap%1:
+    mov  edi, [%2 + 8*rax - 8]
+    mov  esi, [%2 + 8*rax - 4]
+    and  edi, 7
+    and  esi, 7
+    mov  edi, [edge_map + 4*rdi GLOBAL]
+    mov  esi, [edge_map + 4*rsi GLOBAL]
+    mov  [%2 + 8*rax - 8], edi
+    mov  [%2 + 8*rax - 4], esi
+    dec  eax
+    jnz  .remap%1
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_pixel_edge_detect_sse2( uint8_t *pix, int stride, int *res4, int *res8 )
+;-----------------------------------------------------------------------------
+x264_pixel_edge_detect_sse2:
+    push rbp
+    mov  rbp, rsp
+    sub  rsp, 0x80
+    and  rsp, -16
+    sub  parm1q, parm2q
+%define amp_sum rsp
+%define i8_sum  xmm11
+    ; { HU, H, HD, DDR, VR, V, VL, DDL }
+    ;    0, 6,  2,   4,  3, 7,  1,   5
+    pxor   xmm15, xmm15
+    movdqa xmm14, [pw_000d GLOBAL] ; 13 = 64*tan(pi*1/16)
+    movdqa xmm13, [pw_002b GLOBAL] ; 43 = 64*tan(pi*3/16)
+    movdqu xmm12, [dw_0624+8 GLOBAL]
+    pxor   i8_sum, i8_sum
+
+    mov  r10d, 2
+.loopx:
+    mov  r11d, 4
+.loopy:
+    movdqa [amp_sum+0x00], xmm15
+    movdqa [amp_sum+0x10], xmm15
+    movdqa [amp_sum+0x20], xmm15
+    movdqa [amp_sum+0x30], xmm15
+    movdqa [amp_sum+0x40], xmm15
+    movdqa [amp_sum+0x50], xmm15
+    movdqa [amp_sum+0x60], xmm15
+    movdqa [amp_sum+0x70], xmm15
+    mov  eax, 4
+.loop4x4:
+
+    ; calculate gradient
+    movq      xmm0, [parm1q+parm2q]
+    movq      xmm1, [parm1q+parm2q-1]
+    movq      xmm2, [parm1q]
+    movq      xmm3, [parm1q-1]
+    punpcklbw xmm0, xmm15
+    punpcklbw xmm1, xmm15
+    punpcklbw xmm2, xmm15
+    punpcklbw xmm3, xmm15
+    psubw     xmm0, xmm3
+    psubw     xmm1, xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1 ; a+b-c-d
+    psubw     xmm1, xmm2 ; b+d-a-c
+    movdqa    xmm9, xmm0
+    pxor      xmm9, xmm1
+    psraw     xmm9, 15 ; quadrant
+    pxor      xmm2, xmm2
+    pxor      xmm3, xmm3
+    psubw     xmm2, xmm0
+    psubw     xmm3, xmm1
+    pmaxsw    xmm0, xmm2 ; abs(dI/dy)
+    pmaxsw    xmm1, xmm3 ; abs(dI/dx)
+    movdqa    xmm8, xmm0
+    paddw     xmm8, xmm1 ; amp
+    pcmpeqb   xmm7, xmm7 ; ifelse mask
+
+    ; classify gradient into one of 8 directions,
+    ; and sum the magnitudes of each direction
+    ; 9.6 fixed-point math
+    movdqa    xmm2, xmm1
+    movdqa    xmm3, xmm0
+    movdqa    xmm4, xmm1
+    movdqa    xmm5, xmm0
+    pmullw    xmm2, xmm14
+    pmullw    xmm3, xmm14
+    pmullw    xmm4, xmm13
+    pmullw    xmm5, xmm13
+    psllw     xmm0, 6
+    psllw     xmm1, 6
+    pcmpgtw   xmm2, xmm0 ; ay < ax*tan(pi*1/16)
+    pcmpgtw   xmm3, xmm1 ; ax < ay*tan(pi*1/16)
+    pcmpgtw   xmm4, xmm0 ; ay < ax*tan(pi*3/16)
+    pcmpgtw   xmm5, xmm1 ; ax < ay*tan(pi*3/16)
+
+    pxor      xmm7, xmm2
+    pand      xmm2, xmm8
+    paddw     xmm2, [amp_sum+0x50]
+    movdqa    [amp_sum+0x50], xmm2 ; V
+
+    pand      xmm3, xmm7
+    pxor      xmm7, xmm3
+    pand      xmm3, xmm8
+    paddw     xmm3, [amp_sum+0x10]
+    movdqa    [amp_sum+0x10], xmm3 ; H
+
+    pand      xmm4, xmm7
+    pxor      xmm7, xmm4
+    pand      xmm4, xmm8
+    movdqa    xmm0, xmm9
+    pandn     xmm0, xmm4
+    pand      xmm4, xmm9
+    paddw     xmm4, [amp_sum+0x60]
+    paddw     xmm0, [amp_sum+0x40]
+    movdqa    [amp_sum+0x60], xmm4 ; VL
+    movdqa    [amp_sum+0x40], xmm0 ; VR
+
+    pand      xmm5, xmm7
+    pxor      xmm7, xmm5
+    pand      xmm5, xmm8
+    movdqa    xmm0, xmm9
+    pandn     xmm0, xmm5
+    pand      xmm5, xmm9
+    paddw     xmm5, [amp_sum+0x00]
+    paddw     xmm0, [amp_sum+0x20]
+    movdqa    [amp_sum+0x00], xmm5 ; HU
+    movdqa    [amp_sum+0x20], xmm0 ; HD
+
+    pand      xmm7, xmm8
+    pand      xmm9, xmm7
+    pxor      xmm7, xmm9
+    paddw     xmm9, [amp_sum+0x70]
+    paddw     xmm7, [amp_sum+0x30]
+    movdqa    [amp_sum+0x70], xmm9 ; DDL
+    movdqa    [amp_sum+0x30], xmm7 ; DDR
+
+    add  parm1q, parm2q
+    dec  eax
+    jnz  .loop4x4
+
+    ; find which directions were strongest for these two 4x4 blocks
+    HADDW xmm0, xmm8, [amp_sum+0x00]
+    HADDW xmm1, xmm8, [amp_sum+0x10]
+    HADDW xmm2, xmm8, [amp_sum+0x20]
+    HADDW xmm3, xmm8, [amp_sum+0x30]
+    HADDW xmm4, xmm8, [amp_sum+0x40]
+    HADDW xmm5, xmm8, [amp_sum+0x50]
+    HADDW xmm6, xmm8, [amp_sum+0x60]
+    HADDW xmm7, xmm8, [amp_sum+0x70]
+
+    psllq xmm3, 48+3
+    psllq xmm7, 48+3
+    psllq xmm2, 48+3
+    psllq xmm6, 48+3
+    psrlq xmm2, 16
+    psrlq xmm6, 16
+    psllq xmm1, 48+3
+    psllq xmm5, 48+3
+    psrlq xmm1, 32
+    psrlq xmm5, 32
+    psllq xmm0, 48+3
+    psllq xmm4, 48+3
+    psrlq xmm0, 48
+    psrlq xmm4, 48
+    por   xmm0, xmm1
+    por   xmm4, xmm5
+    por   xmm0, xmm2
+    por   xmm4, xmm6
+    por   xmm0, xmm3
+    por   xmm4, xmm7
+
+    pshufd xmm1, xmm0, 01001110b
+    pshufd xmm2, xmm4, 01001110b
+    paddsw xmm1, xmm0
+    paddsw xmm2, xmm4
+    psrldq xmm1, 8
+    pslldq xmm2, 8
+    por    xmm1, xmm2
+    paddsw i8_sum, xmm1
+
+    por    xmm0, [dw_0624 GLOBAL]
+    por    xmm4, [dw_3715 GLOBAL]
+    pmaxsw xmm0, xmm4
+    pshufd xmm1, xmm0, 10110001b
+    pmaxsw xmm0, xmm1
+    movdqa xmm1, xmm0
+    psrld  xmm1, 16
+    pmaxsw xmm0, xmm1
+    movd   [parm3q+0], xmm0
+    psrldq xmm0, 8
+    movd   [parm3q+4], xmm0
+
+    test r11d, 1
+    jz .oddy
+    ; find which direction was strongest for this 8x8 block
+    ; FIXME can occasionally require up to 14 bits for the sums, but only has 12
+    ; if multiple directions saturate, we won't know which is strongest
+    psrlw   i8_sum, 3
+    psllw   i8_sum, 3
+    por     i8_sum, xmm12
+    pshufd  xmm1, i8_sum, 1110b
+    pmaxsw  i8_sum, xmm1
+    pshuflw xmm1, i8_sum, 1110b
+    pmaxsw  i8_sum, xmm1
+    pshuflw xmm1, i8_sum, 1
+    pmaxsw  i8_sum, xmm1
+    movd    [parm4q], i8_sum
+    pxor    i8_sum, i8_sum
+    add     parm4q, 8
+.oddy:
+
+    add  parm3q, 16
+    dec  r11d
+    jnz  .loopy
+
+    sub  parm3q, 56
+    sub  parm4q, 12
+    mov  rax, parm2q
+    shl  rax, 4
+    sub  parm1q, rax
+    add  parm1q, 8
+    dec  r10d
+    jnz  .loopx
+
+    EDGE_REMAP 16, (parm3q - 16)
+    EDGE_REMAP 4,  (parm4q - 8)
+
+    mov  rsp, rbp
+    pop  rbp
+    ret
+%undef i8_sum
+%undef amp_sum
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c	(revision 516)
+++ tools/checkasm.c	(working copy)
@@ -92,6 +92,24 @@
     TEST_PIXEL_X(3);
     TEST_PIXEL_X(4);
 
+    ok = 1; used_asm = 0;
+    {
+        int res4_c[16], res8_c[4], res4_a[16], res8_a[4];
+        if( pixel_asm.edge_detect != pixel_ref.edge_detect )
+        {
+            used_asm = 1;
+            pixel_c.edge_detect( buf1, 16, res4_c, res8_c );
+            pixel_asm.edge_detect( buf1, 16, res4_a, res8_a );
+            if( memcmp(res4_c, res4_a, sizeof(res4_c)) ||
+                memcmp(res8_c, res8_a, sizeof(res8_c)) )
+            {
+                ok = 0;
+            }
+        }
+    }
+    report( "edge_detect :" );
+
     return ret;
 }
 