diff --git a/common/bs.h b/common/bs.h
index d5db977..e1dcafb 100644
--- a/common/bs.h
+++ b/common/bs.h
@@ -88,7 +88,7 @@ static inline int bs_pos( bs_t *s )
 /* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
 static inline void bs_flush( bs_t *s )
 {
-    *(uint32_t*)s->p = endian_fix32( s->cur_bits << (s->i_left&31) );
+    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
     s->p += WORD_SIZE - s->i_left / 8;
     s->i_left = WORD_SIZE*8;
 }
@@ -102,9 +102,9 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
     if( s->i_left <= 32 )
     {
 #ifdef WORDS_BIGENDIAN
-        *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+        M32( s->p ) = s->cur_bits >> (32 - s->i_left);
 #else
-        *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+        M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
 #endif
         s->i_left += 32;
         s->p += 4;
@@ -121,7 +121,7 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
     {
         i_count -= s->i_left;
         s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
-        *(uint32_t*)s->p = endian_fix( s->cur_bits );
+        M32( s->p ) = endian_fix( s->cur_bits );
         s->p += 4;
         s->cur_bits = i_bits;
         s->i_left = 32 - i_count;
@@ -144,7 +144,7 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
     s->i_left--;
     if( s->i_left == WORD_SIZE*8-32 )
     {
-        *(uint32_t*)s->p = endian_fix32( s->cur_bits );
+        M32( s->p ) = endian_fix32( s->cur_bits );
         s->p += 4;
         s->i_left = WORD_SIZE*8;
     }
diff --git a/common/common.h b/common/common.h
index 8bd71d3..3ea5155 100644
--- a/common/common.h
+++ b/common/common.h
@@ -78,6 +78,21 @@ do {\
 #include
 #include
 #include
+
+/* Unions for type-punning without aliasing violations.
+ * Mn: load or store n bits, aligned, native-endian
+ * CPn: copy n bits, aligned, native-endian
+ * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
+typedef union { uint16_t i; uint8_t c[2]; } x264_union16_t;
+typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } x264_union32_t;
+typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } x264_union64_t;
+#define M16(src) (((x264_union16_t*)(src))->i)
+#define M32(src) (((x264_union32_t*)(src))->i)
+#define M64(src) (((x264_union64_t*)(src))->i)
+#define CP16(dst,src) M16(dst) = M16(src)
+#define CP32(dst,src) M32(dst) = M32(src)
+#define CP64(dst,src) M64(dst) = M64(src)
+
 #include "x264.h"
 #include "bs.h"
 #include "set.h"
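For reference, the idiom the new M*/CP* macros rely on: a cast such as *(uint32_t*)p declares that a uint8_t buffer is really a uint32_t object, which violates C's strict-aliasing rules and lets the optimizer reorder or discard such accesses, whereas reading or writing through a union member is the GCC-sanctioned way to perform the same single aligned load or store. A minimal standalone sketch of the idiom (illustrative names only, not part of the patch):

    #include <stdint.h>

    typedef union { uint32_t i; uint8_t c[4]; } u32_t;   /* same layout idea as x264_union32_t */
    #define MY_M32( p ) (((u32_t*)(p))->i)               /* aligned 32-bit load/store */

    /* Zero four consecutive bytes of a 4-byte-aligned buffer with one store. */
    static void clear4( uint8_t *p )
    {
        /* *(uint32_t*)p = 0;   -- the old pattern: an aliasing violation under -fstrict-aliasing */
        MY_M32( p ) = 0;        /* union-based store: same generated code, no aliasing violation */
    }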
diff --git a/common/dct.c b/common/dct.c
index 0aed8d0..245347b 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -607,11 +607,11 @@ static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
 
 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
 {
-    *(uint32_t*)level = *(uint32_t*)dct;
+    CP32( level, dct );
     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    *(uint32_t*)(level+6) = *(uint32_t*)(dct+6);
-    *(uint64_t*)(level+8) = *(uint64_t*)(dct+8);
-    *(uint64_t*)(level+12) = *(uint64_t*)(dct+12);
+    CP32( level+6, dct+6 );
+    CP64( level+8, dct+8 );
+    CP64( level+12, dct+12 );
 }
 #undef ZIG
 
@@ -622,19 +622,19 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
         nz |= level[i];\
     }
 #define COPY4x4\
-    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+    CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+    CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+    CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+    CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 #define COPY8x8\
-    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
+    CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+    CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+    CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+    CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
+    CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
+    CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
+    CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
+    CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
 
 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 {
diff --git a/common/frame.c b/common/frame.c
index ce58b34..d4d68bd 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -728,10 +728,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         {\
             /* *** Get bS for each 4px for the current edge *** */\
             if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
-                *(uint32_t*)bS = 0x03030303;\
+                M32( bS ) = 0x03030303;\
             else\
             {\
-                *(uint32_t*)bS = 0x00000000;\
+                M32( bS ) = 0x00000000;\
                 for( i = 0; i < 4; i++ )\
                 {\
                     int x = i_dir == 0 ? i_edge : i;\
@@ -805,7 +805,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                     goto end##i_dir;\
                 }\
                 DEBLOCK_STRENGTH(i_dir);\
-                if( *(uint32_t*)bS )\
+                if( M32( bS ) )\
                     FILTER_DIR( , i_dir);\
                 end##i_dir:\
                 i_edge += b_8x8_transform+1;\
@@ -816,7 +816,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
             for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
             {\
                 DEBLOCK_STRENGTH(i_dir);\
-                if( *(uint32_t*)bS )\
+                if( M32( bS ) )\
                     FILTER_DIR( , i_dir);\
             }\
         }
diff --git a/common/macroblock.c b/common/macroblock.c
index 356a839..c27430e 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -50,7 +50,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refb == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
             return;
         }
     }
@@ -58,7 +58,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refa == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
             return;
         }
     }
@@ -69,7 +69,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refa == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
             return;
         }
     }
@@ -77,7 +77,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refc == i_ref )
        {
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
             return;
         }
     }
@@ -95,14 +95,14 @@ median:
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
         else if( i_refb == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
         else
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        *(uint32_t*)mvp = *(uint32_t*)mv_a;
+        CP32( mvp, mv_a );
     else
         goto median;
 }
@@ -136,14 +136,14 @@ median:
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
         else if( i_refb == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
         else
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        *(uint32_t*)mvp = *(uint32_t*)mv_a;
+        CP32( mvp, mv_a );
     else
         goto median;
 }
@@ -157,10 +157,10 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
     int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        !( i_refa | *(uint32_t*)mv_a ) ||
-        !( i_refb | *(uint32_t*)mv_b ) )
+        !( i_refa | M32( mv_a ) ) ||
+        !( i_refb | M32( mv_b ) ) )
     {
-        *(uint32_t*)mv = 0;
+        M32( mv ) = 0;
     }
     else
     {
@@ -259,17 +259,12 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     if( ref[0] >= 0 )
         x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
     else
-    {
-        mv[0][0] = 0;
-        mv[0][1] = 0;
-    }
+        M32( mv[0] ) = 0;
+
     if( ref[1] >= 0 )
         x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
     else
-    {
-        mv[1][0] = 0;
-        mv[1][1] = 0;
-    }
+        M32( mv[1] ) = 0;
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
@@ -336,8 +331,8 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
     {
         *b_changed = h->mb.cache.direct_ref[0][0] != h->mb.cache.ref[0][X264_SCAN8_0]
                   || h->mb.cache.direct_ref[1][0] != h->mb.cache.ref[1][X264_SCAN8_0]
-                  || *(uint32_t*)h->mb.cache.direct_mv[0][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[0][X264_SCAN8_0]
-                  || *(uint32_t*)h->mb.cache.direct_mv[1][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[1][X264_SCAN8_0];
+                  || M32( h->mb.cache.direct_mv[0][X264_SCAN8_0] ) != M32( h->mb.cache.mv[0][X264_SCAN8_0] )
+                  || M32( h->mb.cache.direct_mv[1][X264_SCAN8_0] ) != M32( h->mb.cache.mv[1][X264_SCAN8_0] );
     }
     else
     {
@@ -371,14 +366,10 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
     const int y = 2*(idx/2);
     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
-    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] =
-    *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]];
-    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] =
-    *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8];
-    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] =
-    *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]];
-    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] =
-    *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8];
+    CP64( h->mb.cache.mv[0][x264_scan8[idx*4]+0], h->mb.cache.direct_mv[0][x264_scan8[idx*4]+0] );
+    CP64( h->mb.cache.mv[0][x264_scan8[idx*4]+8], h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8] );
+    CP64( h->mb.cache.mv[1][x264_scan8[idx*4]+0], h->mb.cache.direct_mv[1][x264_scan8[idx*4]+0] );
+    CP64( h->mb.cache.mv[1][x264_scan8[idx*4]+8], h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8] );
 }
 
 /* This just improves encoder performance, it's not part of the spec */
@@ -388,7 +379,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     int i = 0;
 
 #define SET_MVP(mvp) { \
-        *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
+        CP32( mvc[i], mvp ); \
         i++; \
     }
 
@@ -403,7 +394,11 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     {
         int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
                                          : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
-        if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+        if( lowres_mv[0][0] != 0x7fff )
+        {
+            M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
+            i++;
+        }
     }
 
     /* spatial predictors */
@@ -982,13 +977,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.i_neighbour_intra |= MB_TOP;
 
         /* load intra4x4 */
-        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
+        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[i_top_xy][0] );
 
        /* load non_zero_count */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[i_top_xy][12] );
         /* shift because x264_scan8[16] is misaligned */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][18] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][22] ) << 8;
     }
     else
     {
@@ -996,12 +991,12 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
        h->mb.cache.i_cbp_top = -1;
 
         /* load intra4x4 */
-        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = 0xFFFFFFFFU;
+        M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
 
         /* load non_zero_count */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] =
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = 0x80808080U;
     }
 
     if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
@@ -1136,13 +1131,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int ir = i_top_8x8 - 1;
             const int iv = i_top_4x4 - 1;
             h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-            *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+            CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
         }
         else
         {
            const int i8 = x264_scan8[0] - 1 - 1*8;
            h->mb.cache.ref[i_list][i8] = -2;
-            *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+            M32( h->mb.cache.mv[i_list][i8] ) = 0;
         }
 
         if( h->mb.i_neighbour & MB_TOP )
@@ -1154,15 +1149,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
            h->mb.cache.ref[i_list][i8+2] =
            h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
-            *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
-            *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
+            CP64( h->mb.cache.mv[i_list][i8+0], h->mb.mv[i_list][iv+0] );
+            CP64( h->mb.cache.mv[i_list][i8+2], h->mb.mv[i_list][iv+2] );
         }
         else
         {
            const int i8 = x264_scan8[0] - 8;
-            *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
-            *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
-            *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
+            M64( h->mb.cache.mv[i_list][i8+0] ) = 0;
+            M64( h->mb.cache.mv[i_list][i8+2] ) = 0;
+            M32( &h->mb.cache.ref[i_list][i8] ) = (uint8_t)(-2) * 0x01010101U;
         }
 
         if( h->mb.i_neighbour & MB_TOPRIGHT )
@@ -1171,13 +1166,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            const int ir = i_top_8x8 + 2;
            const int iv = i_top_4x4 + 4;
            h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-            *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+            CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
         }
         else
         {
            const int i8 = x264_scan8[0] + 4 - 1*8;
            h->mb.cache.ref[i_list][i8] = -2;
-            *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+            M32( h->mb.cache.mv[i_list][i8] ) = 0;
         }
 
        if( h->mb.i_neighbour & MB_LEFT )
@@ -1190,10 +1185,10 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            h->mb.cache.ref[i_list][i8+2*8] =
            h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
 
-            *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
-            *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
-            *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
-            *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
+            CP32( h->mb.cache.mv[i_list][i8+0*8], h->mb.mv[i_list][iv + 0*s4x4] );
+            CP32( h->mb.cache.mv[i_list][i8+1*8], h->mb.mv[i_list][iv + 1*s4x4] );
+            CP32( h->mb.cache.mv[i_list][i8+2*8], h->mb.mv[i_list][iv + 2*s4x4] );
+            CP32( h->mb.cache.mv[i_list][i8+3*8], h->mb.mv[i_list][iv + 3*s4x4] );
         }
         else
         {
@@ -1201,7 +1196,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            for( i = 0; i < 4; i++ )
            {
                h->mb.cache.ref[i_list][i8+i*8] = -2;
-                *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
+                M32( h->mb.cache.mv[i_list][i8+i*8] ) = 0;
            }
         }
 
@@ -1211,30 +1206,30 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            {
                const int i8 = x264_scan8[0] - 8;
                const int iv = i_top_4x4;
-                *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
-                *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
+                CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
+                CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
            }
            else
            {
                const int i8 = x264_scan8[0] - 8;
-                *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
-                *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
+                M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
+                M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
            }
 
            if( i_left_type >= 0 )
            {
                const int i8 = x264_scan8[0] - 1;
                const int iv = i_mb_4x4 - 1;
-                *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
-                *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
-                *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
-                *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
+                CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+                CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+                CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+                CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
            }
            else
            {
                const int i8 = x264_scan8[0] - 1;
                for( i = 0; i < 4; i++ )
-                    *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
+                    M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
            }
        }
    }
@@ -1311,15 +1306,15 @@ void x264_macroblock_cache_save( x264_t *h )
 
     /* save intra4x4 */
     if( i_mb_type == I_4x4 )
     {
-        *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
-        *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
-                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
-                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
     }
     else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
-        *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
+        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
     else
-        *(uint64_t*)intra4x4_pred_mode = (uint8_t)(-1) * 0x0101010101010101ULL;
+        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
 
     if( i_mb_type == I_PCM )
@@ -1335,14 +1330,14 @@ void x264_macroblock_cache_save( x264_t *h )
     else
     {
         /* save non zero count */
-        *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
-        *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
-        *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
-        *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
-        *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
 
         if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
             h->mb.i_qp = h->mb.i_last_qp;
@@ -1365,8 +1360,8 @@ void x264_macroblock_cache_save( x264_t *h )
         h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
         for( y = 0; y < 4; y++ )
         {
-            *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
-            *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+            CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
+            CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
         }
         if( h->sh.i_type == SLICE_TYPE_B )
         {
@@ -1376,8 +1371,8 @@ void x264_macroblock_cache_save( x264_t *h )
             h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
-                *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+                CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
+                CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
             }
         }
     }
@@ -1386,12 +1381,12 @@ void x264_macroblock_cache_save( x264_t *h )
         int i_list;
         for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
         {
-            *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
-            *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+            M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+            M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
-                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+                M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
+                M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
             }
         }
     }
@@ -1408,28 +1403,28 @@ void x264_macroblock_cache_save( x264_t *h )
         {
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
+                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
             }
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
+                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
+                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
                 }
         }
         else
         {
            for( y = 0; y < 4; y++ )
            {
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
+                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
            }
            if( h->sh.i_type == SLICE_TYPE_B )
                for( y = 0; y < 4; y++ )
                {
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
+                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
+                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
                }
         }
diff --git a/common/macroblock.h b/common/macroblock.h
index 9529341..d6589bb 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -338,21 +338,22 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
 }
 static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
 {
+    uint32_t *d = dst;
     if( width == 4 )
     {
         uint32_t val2 = val * 0x01010101;
-        ((uint32_t*)dst)[0] = val2;
-        if( height >= 2 ) ((uint32_t*)dst)[2] = val2;
-        if( height == 4 ) ((uint32_t*)dst)[4] = val2;
-        if( height == 4 ) ((uint32_t*)dst)[6] = val2;
+        M32( d+0 ) = val2;
+        if( height >= 2 ) M32( d+2 ) = val2;
+        if( height == 4 ) M32( d+4 ) = val2;
+        if( height == 4 ) M32( d+6 ) = val2;
     }
     else // 2
     {
         uint32_t val2 = val * 0x0101;
-        ((uint16_t*)dst)[ 0] = val2;
-        if( height >= 2 ) ((uint16_t*)dst)[ 4] = val2;
-        if( height == 4 ) ((uint16_t*)dst)[ 8] = val2;
-        if( height == 4 ) ((uint16_t*)dst)[12] = val2;
+        M16( d+0 ) = val2;
+        if( height >= 2 ) M16( d+2 ) = val2;
+        if( height == 4 ) M16( d+4 ) = val2;
+        if( height == 4 ) M16( d+6 ) = val2;
     }
 }
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
@@ -360,25 +361,27 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int
     int dy;
     if( width == 1 || WORD_SIZE < 8 )
     {
+        uint32_t *d = dst;
         for( dy = 0; dy < height; dy++ )
         {
-            ((uint32_t*)dst)[8*dy+0] = val;
-            if( width >= 2 ) ((uint32_t*)dst)[8*dy+1] = val;
-            if( width == 4 ) ((uint32_t*)dst)[8*dy+2] = val;
-            if( width == 4 ) ((uint32_t*)dst)[8*dy+3] = val;
+            M32( d+8*dy+0 ) = val;
+            if( width >= 2 ) M32( d+8*dy+1 ) = val;
+            if( width == 4 ) M32( d+8*dy+2 ) = val;
+            if( width == 4 ) M32( d+8*dy+3 ) = val;
         }
     }
     else
     {
         uint64_t val64 = val + ((uint64_t)val<<32);
+        uint64_t *d = dst;
         for( dy = 0; dy < height; dy++ )
         {
-            ((uint64_t*)dst)[4*dy+0] = val64;
-            if( width == 4 ) ((uint64_t*)dst)[4*dy+1] = val64;
+            M64( d+4*dy+0 ) = val64;
+            if( width == 4 ) M64( d+4*dy+1 ) = val64;
         }
     }
 }
-#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
 static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
     x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
@@ -462,7 +465,7 @@ static inline int x264_mb_transform_8x8_allowed( x264_t *h )
         return 0;
     if( h->mb.i_type != P_8x8 )
         return partition_tab[h->mb.i_type];
-    return *(uint32_t*)h->mb.i_sub_partition == D_L0_8x8*0x01010101;
+    return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
 }
 
 #endif
diff --git a/common/predict.c b/common/predict.c
index 00c4648..92e0992 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -44,11 +44,10 @@
 #define PREDICT_16x16_DC(v) \
     for( i = 0; i < 16; i++ )\
     {\
-        uint32_t *p = (uint32_t*)src;\
-        *p++ = v;\
-        *p++ = v;\
-        *p++ = v;\
-        *p++ = v;\
+        M32( src+ 0 ) = v;\
+        M32( src+ 4 ) = v;\
+        M32( src+ 8 ) = v;\
+        M32( src+12 ) = v;\
         src += FDEC_STRIDE;\
     }
 
@@ -104,32 +103,28 @@ static void predict_16x16_h( uint8_t *src )
     for( i = 0; i < 16; i++ )
     {
         const uint32_t v = 0x01010101 * src[-1];
-        uint32_t *p = (uint32_t*)src;
-
-        *p++ = v;
-        *p++ = v;
-        *p++ = v;
-        *p++ = v;
-
+        M32( src+ 0 ) = v;
+        M32( src+ 4 ) = v;
+        M32( src+ 8 ) = v;
+        M32( src+12 ) = v;
         src += FDEC_STRIDE;
     }
 }
 
 static void predict_16x16_v( uint8_t *src )
 {
-    uint32_t v0 = *(uint32_t*)&src[ 0-FDEC_STRIDE];
-    uint32_t v1 = *(uint32_t*)&src[ 4-FDEC_STRIDE];
-    uint32_t v2 = *(uint32_t*)&src[ 8-FDEC_STRIDE];
-    uint32_t v3 = *(uint32_t*)&src[12-FDEC_STRIDE];
+    uint32_t v0 = M32( &src[ 0-FDEC_STRIDE] );
+    uint32_t v1 = M32( &src[ 4-FDEC_STRIDE] );
+    uint32_t v2 = M32( &src[ 8-FDEC_STRIDE] );
+    uint32_t v3 = M32( &src[12-FDEC_STRIDE] );
     int i;
 
     for( i = 0; i < 16; i++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = v0;
-        *p++ = v1;
-        *p++ = v2;
-        *p++ = v3;
+        M32( src+ 0 ) = v0;
+        M32( src+ 4 ) = v1;
+        M32( src+ 8 ) = v2;
+        M32( src+12 ) = v3;
         src += FDEC_STRIDE;
     }
 }
@@ -178,9 +173,8 @@ static void predict_8x8c_dc_128( uint8_t *src )
 
     for( y = 0; y < 8; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = 0x80808080;
-        *p++ = 0x80808080;
+        M32( src+0 ) = 0x80808080;
+        M32( src+4 ) = 0x80808080;
         src += FDEC_STRIDE;
     }
 }
@@ -199,16 +193,14 @@ static void predict_8x8c_dc_left( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc0;
-        *p++ = dc0;
+        M32( src+0 ) = dc0;
+        M32( src+4 ) = dc0;
         src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc1;
-        *p++ = dc1;
+        M32( src+0 ) = dc1;
+        M32( src+4 ) = dc1;
         src += FDEC_STRIDE;
     }
 
@@ -228,9 +220,8 @@ static void predict_8x8c_dc_top( uint8_t *src )
 
     for( y = 0; y < 8; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc0;
-        *p++ = dc1;
+        M32( src+0 ) = dc0;
+        M32( src+4 ) = dc1;
         src += FDEC_STRIDE;
     }
 }
@@ -264,17 +255,15 @@ static void predict_8x8c_dc( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc0;
-        *p++ = dc1;
+        M32( src+0 ) = dc0;
+        M32( src+4 ) = dc1;
         src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc2;
-        *p++ = dc3;
+        M32( src+0 ) = dc2;
+        M32( src+4 ) = dc3;
         src += FDEC_STRIDE;
     }
 }
@@ -285,23 +274,21 @@ static void predict_8x8c_h( uint8_t *src )
     for( i = 0; i < 8; i++ )
     {
         uint32_t v = 0x01010101 * src[-1];
-        uint32_t *p = (uint32_t*)src;
-        *p++ = v;
-        *p++ = v;
+        M32( src+0 ) = v;
+        M32( src+4 ) = v;
         src += FDEC_STRIDE;
     }
 }
 
 static void predict_8x8c_v( uint8_t *src )
 {
-    uint32_t v0 = *(uint32_t*)&src[0-FDEC_STRIDE];
-    uint32_t v1 = *(uint32_t*)&src[4-FDEC_STRIDE];
+    uint32_t v0 = M32( src+0-FDEC_STRIDE );
+    uint32_t v1 = M32( src+4-FDEC_STRIDE );
     int i;
     for( i = 0; i < 8; i++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = v0;
-        *p++ = v1;
+        M32( src+0 ) = v0;
+        M32( src+4 ) = v1;
         src += FDEC_STRIDE;
     }
 }
@@ -343,10 +330,12 @@ static void predict_8x8c_p( uint8_t *src )
  ****************************************************************************/
 
 #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC32(x,y) *(uint32_t*)&SRC(x,y)
 
 #define PREDICT_4x4_DC(v)\
-    SRC32(0,0) = SRC32(0,1) = SRC32(0,2) = SRC32(0,3) = v;
+    M32( &SRC(0,0) ) = v;\
+    M32( &SRC(0,1) ) = v;\
+    M32( &SRC(0,2) ) = v;\
+    M32( &SRC(0,3) ) = v;\
 
 static void predict_4x4_dc_128( uint8_t *src )
 {
@@ -370,14 +359,14 @@ static void predict_4x4_dc( uint8_t *src )
 }
 static void predict_4x4_h( uint8_t *src )
 {
-    SRC32(0,0) = SRC(-1,0) * 0x01010101;
-    SRC32(0,1) = SRC(-1,1) * 0x01010101;
-    SRC32(0,2) = SRC(-1,2) * 0x01010101;
-    SRC32(0,3) = SRC(-1,3) * 0x01010101;
+    M32( &SRC(0,0) ) = SRC(-1,0) * 0x01010101;
+    M32( &SRC(0,1) ) = SRC(-1,1) * 0x01010101;
+    M32( &SRC(0,2) ) = SRC(-1,2) * 0x01010101;
+    M32( &SRC(0,3) ) = SRC(-1,3) * 0x01010101;
 }
 static void predict_4x4_v( uint8_t *src )
 {
-    PREDICT_4x4_DC(SRC32(0,-1));
+    PREDICT_4x4_DC(M32( &SRC(0,-1)) );
 }
 
 #define PREDICT_4x4_LOAD_LEFT\
@@ -535,7 +524,7 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
         }
         else
         {
-            *(uint64_t*)(edge+24) = SRC(7,-1) * 0x0101010101010101ULL;
+            M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
             edge[32] = SRC(7,-1);
         }
     }
@@ -561,8 +550,8 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
+        M32( src+0 ) = v; \
+        M32( src+4 ) = v; \
         src += FDEC_STRIDE; \
     }
@@ -593,17 +582,17 @@ static void predict_8x8_dc( uint8_t *src, uint8_t edge[33] )
 static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_LOAD_LEFT
-#define ROW(y) ((uint32_t*)(src+y*FDEC_STRIDE))[0] =\
-               ((uint32_t*)(src+y*FDEC_STRIDE))[1] = 0x01010101U * l##y
+#define ROW(y) M32( src+y*FDEC_STRIDE+0 ) = 0x01010101U * l##y;\
+               M32( src+y*FDEC_STRIDE+4 ) = 0x01010101U * l##y;
     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
 #undef ROW
 }
 static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
 {
-    const uint64_t top = *(uint64_t*)(edge+16);
+    const uint64_t top = M64( edge+16 );
     int y;
     for( y = 0; y < 8; y++ )
-        *(uint64_t*)(src+y*FDEC_STRIDE) = top;
+        M64( src+y*FDEC_STRIDE ) = top;
 }
 static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
 {
@@ -680,27 +669,28 @@ static void predict_8x8_hd( uint8_t *src, uint8_t edge[33] )
     PREDICT_8x8_LOAD_TOP
     PREDICT_8x8_LOAD_LEFT
     PREDICT_8x8_LOAD_TOPLEFT
-    int p1 = pack8to16(F1(l6,l7), F2(l5,l6,l7));
-    int p2 = pack8to16(F1(l5,l6), F2(l4,l5,l6));
-    int p3 = pack8to16(F1(l4,l5), F2(l3,l4,l5));
-    int p4 = pack8to16(F1(l3,l4), F2(l2,l3,l4));
-    int p5 = pack8to16(F1(l2,l3), F2(l1,l2,l3));
-    int p6 = pack8to16(F1(l1,l2), F2(l0,l1,l2));
-    int p7 = pack8to16(F1(l0,l1), F2(lt,l0,l1));
-    int p8 = pack8to16(F1(lt,l0), F2(l0,lt,t0));
-    int p9 = pack8to16(F2(t1,t0,lt), F2(t2,t1,t0));
-    int p10 = pack8to16(F2(t3,t2,t1), F2(t4,t3,t2));
-    int p11 = pack8to16(F2(t5,t4,t3), F2(t6,t5,t4));
-    SRC32(0,7)= pack16to32(p1,p2);
-    SRC32(0,6)= pack16to32(p2,p3);
-    SRC32(4,7)=SRC32(0,5)= pack16to32(p3,p4);
-    SRC32(4,6)=SRC32(0,4)= pack16to32(p4,p5);
-    SRC32(4,5)=SRC32(0,3)= pack16to32(p5,p6);
-    SRC32(4,4)=SRC32(0,2)= pack16to32(p6,p7);
-    SRC32(4,3)=SRC32(0,1)= pack16to32(p7,p8);
-    SRC32(4,2)=SRC32(0,0)= pack16to32(p8,p9);
-    SRC32(4,1)= pack16to32(p9,p10);
-    SRC32(4,0)= pack16to32(p10,p11);
+    SRC(0,7)= (l6 + l7 + 1) >> 1;
+    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
+    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
+    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
+    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
+    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
+    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
+    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
+    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
+    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
+    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
+    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
+    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
+    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
+    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
 }
 static void predict_8x8_vl( uint8_t *src, uint8_t edge[33] )
 {
@@ -732,22 +722,24 @@ static void predict_8x8_hu( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_LOAD_LEFT
-    int p1 = pack8to16(F1(l0,l1), F2(l0,l1,l2));
-    int p2 = pack8to16(F1(l1,l2), F2(l1,l2,l3));
-    int p3 = pack8to16(F1(l2,l3), F2(l2,l3,l4));
-    int p4 = pack8to16(F1(l3,l4), F2(l3,l4,l5));
-    int p5 = pack8to16(F1(l4,l5), F2(l4,l5,l6));
-    int p6 = pack8to16(F1(l5,l6), F2(l5,l6,l7));
-    int p7 = pack8to16(F1(l6,l7), F2(l6,l7,l7));
-    int p8 = pack8to16(l7,l7);
-    SRC32(0,0)= pack16to32(p1,p2);
-    SRC32(0,1)= pack16to32(p2,p3);
-    SRC32(4,0)=SRC32(0,2)= pack16to32(p3,p4);
-    SRC32(4,1)=SRC32(0,3)= pack16to32(p4,p5);
-    SRC32(4,2)=SRC32(0,4)= pack16to32(p5,p6);
-    SRC32(4,3)=SRC32(0,5)= pack16to32(p6,p7);
-    SRC32(4,4)=SRC32(0,6)= pack16to32(p7,p8);
-    SRC32(4,5)=SRC32(4,6)= SRC32(0,7) = SRC32(4,7) = pack16to32(p8,p8);
+    SRC(0,0)= (l0 + l1 + 1) >> 1;
+    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
+    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
+    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
+    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
+    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
+    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
+    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
+    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
+    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
+    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
+    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
 }
 
 /****************************************************************************
diff --git a/common/quant.c b/common/quant.c
index 096a4b3..7434a3d 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -178,7 +178,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max )
     int idx = i_max - 1;
     /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned.
        idx>=0 instead of 1 works correctly for the same reason */
-    while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 )
+    while( idx >= 0 && M32( &dct[idx-1] ) == 0 )
         idx -= 2;
     if( idx >= 0 && dct[idx] == 0 )
         idx--;
@@ -218,7 +218,7 @@ static int ALWAYS_INLINE x264_coeff_last_internal( int16_t *l, int i_count )
 {
     int i_last;
     for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
-        if( *(uint64_t*)(l+i_last-3) )
+        if( M64( l+i_last-3 ) )
             break;
     while( i_last >= 0 && l[i_last] == 0 )
         i_last--;
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 5cfa6fd..602ddcd 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -266,12 +266,12 @@ static void predict_8x8c_dc_left( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        *(uint64_t*)src = dc0;
+        M64( src ) = dc0;
         src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-        *(uint64_t*)src = dc1;
+        M64( src ) = dc1;
         src += FDEC_STRIDE;
     }
 
@@ -296,8 +296,8 @@ static void predict_8x8c_dc_left( uint8_t *src )
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
+        M32( src+0 ) = v; \
+        M32( src+4 ) = v; \
         src += FDEC_STRIDE; \
     }
diff --git a/common/x86/util.h b/common/x86/util.h
index 5ace375..efc700a 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -38,8 +38,8 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
         "pminsw %%mm2, %%mm0 \n"
         "pmaxsw %%mm1, %%mm0 \n"
         "movd %%mm0, %0 \n"
-        :"=m"(*(uint32_t*)dst)
-        :"m"(*(uint32_t*)a), "m"(*(uint32_t*)b), "m"(*(uint32_t*)c)
+        :"=m"(*(x264_union32_t*)dst)
+        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
     );
 }
 #define x264_predictor_difference x264_predictor_difference_mmxext
@@ -69,7 +69,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "jg 1b \n"
         "movq %%mm4, %0 \n"
         :"=m"(output), "+r"(i_mvc)
-        :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
+        :"r"(mvc), "m"(M64( mvc ))
     );
     sum += output[0] + output[1] + output[2] + output[3];
     return sum;
@@ -98,7 +98,7 @@ static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16
         "pminsw %5, %%mm0 \n"
         "movd %%mm0, %0 \n"
         :"=r"(amvd)
-        :"m"(*(uint32_t*)mvdleft),"m"(*(uint32_t*)mvdtop),
+        :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
         "m"(pw_28),"m"(pw_2184),"m"(pw_2)
     );
     return amvd;
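Several of the rewritten predictors above (predict_4x4_h, predict_8x8c_h, the PREDICT_*_DC macros, and x264_macroblock_cache_rect1) combine M32 with the splat-by-multiply trick: multiplying a byte value by 0x01010101 replicates it into all four lanes of a 32-bit word, so a single aligned store fills four pixels at once. A small sketch of that pattern, with a hypothetical helper name, assuming the M32 macro from common.h and a 4-byte-aligned destination:

    /* Fill one 4-pixel row with a single sample value using one 32-bit store. */
    static inline void fill_row4( uint8_t *dst, uint8_t v )
    {
        M32( dst ) = 0x01010101U * v;   /* multiplying by 0x01010101 copies v into all four bytes */
    }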
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 6e10202..48499e1 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -874,10 +874,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-            h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-            h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-            h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-            h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+            h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+            h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+            h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+            h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
             h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
@@ -918,7 +918,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
             if( b_merged_satd && i_max >= 6 )
             {
@@ -964,10 +964,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-            h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-            h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-            h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-            h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+            h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+            h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+            h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+            h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
             h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
@@ -1092,7 +1092,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 
         if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
             /* emulate missing topright samples */
-            *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+            M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
         for( i = 0; i < i_max; i++ )
         {
@@ -1107,18 +1107,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             {
                 a->i_predict4x4[idx] = i_mode;
                 i_best = i_satd;
-                pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
-                pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
-                pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
-                pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
+                pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
+                pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
+                pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
+                pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
             }
         }
 
-        *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
-        *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
-        *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
-        *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
+        M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+        M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+        M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+        M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
         h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
 
         h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
@@ -1163,21 +1163,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                 cbp_luma_new = h->mb.i_cbp_luma;
                 i_best = i_satd;
 
-                pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
+                pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                 if( !(idx&1) )
                     for( j=0; j<7; j++ )
                         pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
-                i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
-                i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
+                i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
+                i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
             }
         }
         a->i_cbp_i8x8_luma = cbp_luma_new;
-        *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
+        M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
         if( !(idx&1) )
             for( j=0; j<7; j++ )
                 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
-        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
-        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
+        M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
+        M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
 
         x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
     }
@@ -1259,8 +1259,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
 
         /* save mv for predicting neighbors */
-        *(uint32_t*)a->l0.mvc[i_ref][0] =
-        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( a->l0.mvc[i_ref][0], m.mv );
+        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
@@ -1270,7 +1270,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     if( a->i_mbrd )
     {
         x264_mb_cache_fenc_satd( h );
-        if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
@@ -1308,7 +1308,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     }
 
     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
-        *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
+        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
 
     for( i = 0; i < 4; i++ )
     {
@@ -1335,7 +1335,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
             m.cost += i_ref_cost;
             i_halfpel_thresh += i_ref_cost;
 
-            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+            CP32( a->l0.mvc[i_ref][i+1], m.mv );
 
             if( m.cost < l0m->cost )
                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
@@ -1372,7 +1372,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_partition = D_8x8;
 
     i_mvc = 1;
-    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
+    CP32( mvc[0], a->l0.me16x16.mv );
 
     for( i = 0; i < 4; i++ )
     {
@@ -1392,7 +1392,7 @@
            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
 
-            *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
+            CP32( mvc[i_mvc], m->mv );
            i_mvc++;
 
            /* mb type cost */
@@ -1438,9 +1438,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
            m.i_ref_cost = i_ref_cost;
 
            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
-            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+            CP32( mvc[0], a->l0.mvc[i_ref][0] );
+            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
+            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
 
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
@@ -1487,9 +1487,9 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;
 
-            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
-            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+            CP32( mvc[0], a->l0.mvc[i_ref][0] );
+            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
+            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
 
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
@@ -1731,7 +1731,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
        }
 
        /* save mv for predicting neighbors */
-        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
     }
 
     a->l0.me16x16.i_ref = a->l0.i_ref;
@@ -1760,7 +1760,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
        }
 
        /* save mv for predicting neighbors */
-        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
     }
 
     a->l1.me16x16.i_ref = a->l1.i_ref;
@@ -1972,8 +1972,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
            LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
 
-            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
-            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
+            CP32( mvc[0], lX->me8x8[2*i].mv );
+            CP32( mvc[1], lX->me8x8[2*i+1].mv );
 
            x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
@@ -2040,8 +2040,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
 
-            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
-            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
+            CP32( mvc[0], lX->me8x8[i].mv );
+            CP32( mvc[1], lX->me8x8[i+2].mv );
 
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
@@ -2995,7 +2995,7 @@ void x264_macroblock_analyse( x264_t *h )
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
-            *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[0]] == *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[12]] &&
+            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
            h->mb.i_partition = D_16x16;
     }
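The analyse.c hunks above and the encoder/macroblock.c hunks below both compare two int16_t[2] motion vectors with a single M32 equality test rather than comparing the x and y components separately; this is valid because the mv cache keeps each vector 4-byte aligned. A sketch of that comparison, with a hypothetical helper name, assuming the common.h macros and aligned operands:

    /* Compare two 2x int16_t motion vectors with one 32-bit load each. */
    static inline int mv_equal( int16_t a[2], int16_t b[2] )
    {
        return M32( a ) == M32( b );   /* equal iff both components match */
    }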
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 4338e47..4abbc2e 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -155,8 +155,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
 
 #define STORE_8x8_NNZ(idx,nz)\
 {\
-    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
-    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
+}
+
+#define CLEAR_16x16_NNZ \
+{\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
 }
 
 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
@@ -244,10 +252,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         if( decimate_score < 6 )
         {
             h->mb.i_cbp_luma = 0;
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+            CLEAR_16x16_NNZ
         }
 
     h->dctf.dct4x4dc( dct_dc4x4 );
@@ -661,10 +666,10 @@ void x264_macroblock_encode( x264_t *h )
             if( h->mb.i_skip_intra )
             {
                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
                 h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
                 /* In RD mode, restore the now-overwritten DCT data. */
                 if( h->mb.i_skip_intra == 2 )
@@ -691,10 +696,10 @@ void x264_macroblock_encode( x264_t *h )
             if( h->mb.i_skip_intra )
             {
                 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
+                M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
                 h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
                 /* In RD mode, restore the now-overwritten DCT data. */
                 if( h->mb.i_skip_intra == 2 )
@@ -707,7 +712,7 @@ void x264_macroblock_encode( x264_t *h )
 
             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;
 
             if( h->mb.b_lossless )
                 x264_predict_lossless_4x4( h, p_dst, i, i_mode );
@@ -779,10 +784,7 @@ void x264_macroblock_encode( x264_t *h )
             if( i_decimate_mb < 6 && b_decimate )
             {
                 h->mb.i_cbp_luma = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+                CLEAR_16x16_NNZ
             }
             else
             {
@@ -851,10 +853,7 @@ void x264_macroblock_encode( x264_t *h )
                 if( i_decimate_mb < 6 )
                 {
                     h->mb.i_cbp_luma = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+                    CLEAR_16x16_NNZ
                 }
                 else
                 {
@@ -899,7 +898,7 @@ void x264_macroblock_encode( x264_t *h )
     {
         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
             !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
-            *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
             && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
         {
             h->mb.i_type = P_SKIP;
diff --git a/encoder/me.c b/encoder/me.c
index 0f1d96f..707f68c 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -211,7 +211,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
     COST_MV_HPEL( bmx, bmy );
     for( i = 0; i < i_mvc; i++ )
     {
-        if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
+        if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
         {
             int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
             int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
@@ -643,7 +643,7 @@ me_hex2:
         {
             /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
             if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j];
+                CP64( &mvsads[i], &mvsads[j] );
             else
                 mvsads[i] = mvsads[j];
             i += mvsads[j].sad <= sad_thresh;
@@ -659,7 +659,7 @@ me_hex2:
                 nmvsad--;
                 mvsads[bi] = mvsads[nmvsad];
                 if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                    *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad];
+                    CP64( &mvsads[bi], &mvsads[nmvsad] );
                 else
                     mvsads[bi] = mvsads[nmvsad];
             }
@@ -974,8 +974,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
                 if( cost < bcost * SATD_THRESH )
                 {
                     bcost = X264_MIN( cost, bcost );
-                    *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y);
-                    *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y);
+                    M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
+                    M32( cache0_mv2 ) = pack16to32_mask(m0x,m0y);
+                    M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
+                    M32( cache1_mv2 ) = pack16to32_mask(m1x,m1y);
                     h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
                     h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
                     uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
@@ -1038,7 +1040,8 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
         if( satd <= bsatd * SATD_THRESH ) \
         { \
             uint64_t cost; \
-            *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
+            M32( cache_mv ) = pack16to32_mask(mx,my); \
+            M32( cache_mv2 ) = pack16to32_mask(mx,my); \
             cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
             COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
         } \
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index e55494b..4cc9c4f 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -373,10 +373,10 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         ALIGNED_4( int16_t mvc[4][2] );
 
         /* Reverse-order MV prediction. */
-        *(uint32_t*)mvc[0] = 0;
-        *(uint32_t*)mvc[1] = 0;
-        *(uint32_t*)mvc[2] = 0;
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
+        M32( mvc[0] ) = 0;
+        M32( mvc[1] ) = 0;
+        M32( mvc[2] ) = 0;
+#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
         if( i_mb_x < h->sps->i_mb_width - 1 )
             MVC(fenc_mv[1]);
         if( i_mb_y < h->sps->i_mb_height - 1 )
@@ -392,20 +392,20 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             x264_me_search( h, &m[l], mvc, i_mvc );
 
             m[l].cost -= 2; // remove mvcost from skip mbs
-            if( *(uint32_t*)m[l].mv )
+            if( M32( m[l].mv ) )
                 m[l].cost += 5;
-            *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+            CP32( fenc_mvs[l], m[l].mv );
             *fenc_costs[l] = m[l].cost;
         }
         else
         {
-            *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+            CP32( m[l].mv, fenc_mvs[l] );
             m[l].cost = *fenc_costs[l];
         }
         COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
     }
 
-    if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
+    if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
         TRY_BIDIR( m[0].mv, m[1].mv, 5 );
 
     /* Store to width-2 bitfield. */