/* Copyright (C) 2007 Loren Merritt
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * 
 * 3. The name of the author may not be used to endorse or promote products
 * derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "dcts_constants.h"
#include "dcts-a.h"

#define SHIFT(x) (((x)+(1<<15))>>16)

// Per-coefficient scale factors that the test drivers multiply into the
// output of the AAN forward DCT before printing it next to the other variants.
// Entries 1..7 are 1/(cos(pi*i/16)*sqrt(2)); entry 0 is 1 (the formula would
// give 1/sqrt(2) there, the table deliberately does not).
static const float aan_scale[8] = {
    1.00000000000000000000,
    0.72095982200694791383,
    0.76536686473017954350,
    0.85043009476725644878,
    1.00000000000000000000,
    1.27275858057283393842,
    1.84775906502257351242,
    3.62450978541155137218,
};

/* Print n ints from block on a single line, each as "%5d ", then a newline.
 *
 * transpose: 0 prints elements in storage order. Any other value is used as a
 *            bit count: output position i reads the element whose index is i
 *            with its low `transpose` bits and the bits above them swapped,
 *            i.e. the row/column swap of a (1<<transpose)-wide matrix.
 * mul:       0 prints the raw value; otherwise SHIFT(value*mul) is printed. */
void dump_int(int *block, int n, int transpose, int mul) {
    const int low_mask = (1 << transpose) - 1;
    int pos = 0;
    while (pos < n) {
        int src = pos;
        if (transpose)
            src = ((pos & low_mask) << transpose) | (pos >> transpose);
        int value = block[src];
        if (mul)
            value = SHIFT(value * mul);
        printf("%5d ", value);
        ++pos;
    }
    printf("\n");
}

/* 16-bit counterpart of dump_int: print n int16_t values from block on one
 * line, each as "%5d ", then a newline.
 *
 * transpose: 0 prints elements in storage order. Any other value is used as a
 *            bit count: output position i reads the element whose index is i
 *            with its low `transpose` bits and the bits above them swapped.
 * mul:       0 prints the raw value; otherwise SHIFT(value*mul) is printed
 *            (the product is formed in int). */
void dump_int16(int16_t *block, int n, int transpose, int mul) {
    const int low_mask = (1 << transpose) - 1;
    int pos = 0;
    while (pos < n) {
        int src = pos;
        if (transpose)
            src = ((pos & low_mask) << transpose) | (pos >> transpose);
        int value = block[src];
        if (mul)
            value = SHIFT(value * mul);
        printf("%5d ", value);
        ++pos;
    }
    printf("\n");
}

/* Print n floats from block on a single line, each as "%7.4f ", then a
 * newline.
 *
 * transpose: 0 prints elements in storage order. Any other value is used as a
 *            bit count: output position i reads the element whose index is i
 *            with its low `transpose` bits and the bits above them swapped.
 * mul:       0 prints the raw value; otherwise value*mul is printed. */
void dump_float(float *block, int n, int transpose, float mul) {
    const int low_mask = (1 << transpose) - 1;
    int pos = 0;
    while (pos < n) {
        int src = pos;
        if (transpose)
            src = ((pos & low_mask) << transpose) | (pos >> transpose);
        float value = block[src];
        if (mul)
            value = block[src] * mul;
        printf("%7.4f ", value);
        ++pos;
    }
    printf("\n");
}

/* Scale n ints in place by 2^-log using a right shift.
 * For log >= 2 a bias of 2^(log-1)-1 is added before shifting; for log < 2 no
 * bias is added. The bias is one short of a full half step because the dct
 * already added part of the rounding term. */
void div_int(int *block, int n, int log) {
    const int bias = (log >= 2) ? (1 << (log - 1)) - 1 : 0;
    int idx = 0;
    while (idx < n) {
        block[idx] = (block[idx] + bias) >> log;
        ++idx;
    }
}

/* 16-bit counterpart of div_int: scale n int16_t values in place by 2^-log
 * using a right shift (arithmetic is done in int, then stored back).
 * For log >= 2 a bias of 2^(log-1)-1 is added before shifting; for log < 2 no
 * bias is added, because the dct already added part of the rounding term. */
void div_int16(int16_t *block, int n, int log) {
    const int bias = (log >= 2) ? (1 << (log - 1)) - 1 : 0;
    int idx = 0;
    while (idx < n) {
        block[idx] = (block[idx] + bias) >> log;
        ++idx;
    }
}

/* Scale n floats in place by 2^-log (multiplication by the reciprocal of
 * 1<<log; no rounding bias is involved). */
void div_float(float *block, int n, int log) {
    const float scale = (float)(1.0 / (1 << log));
    int idx = 0;
    while (idx < n) {
        block[idx] *= scale;
        ++idx;
    }
}

// Value of the single nonzero input sample used by the macro-generated 1-D
// test drivers (test4/test8/test16), picked by type through sample_##name.
const int sample_int = 1000;
const float sample_float = 1.0;

// Element x of the 1-D signal a transform is working on. Every generated
// transform takes (block, stride) and reads/writes block[x*stride], so the
// same code serves rows (stride 1) and columns (stride n) of a 2-D block.
// Expands to an lvalue; `block` and `stride` must be in scope at the use site.
// The argument is not parenthesized, so pass only simple index expressions.
#define SRC(x) block[x*stride]

// LOAD4 / LOAD8 / LOAD16 declare locals a0..a3 / a0..a7 / a0..a15 of type
// `dctelem` and initialize them from SRC(0), SRC(1), ... `dctelem` is a macro
// that is #defined to float or int before the DCTn() generators are expanded.
#define LOAD4\
    dctelem a0 = SRC(0);\
    dctelem a1 = SRC(1);\
    dctelem a2 = SRC(2);\
    dctelem a3 = SRC(3);\

#define LOAD8\
    LOAD4\
    dctelem a4 = SRC(4);\
    dctelem a5 = SRC(5);\
    dctelem a6 = SRC(6);\
    dctelem a7 = SRC(7);\

#define LOAD16\
    LOAD8\
    dctelem  a8 = SRC( 8);\
    dctelem  a9 = SRC( 9);\
    dctelem a10 = SRC(10);\
    dctelem a11 = SRC(11);\
    dctelem a12 = SRC(12);\
    dctelem a13 = SRC(13);\
    dctelem a14 = SRC(14);\
    dctelem a15 = SRC(15);\

// Building blocks shared by all transform generators below. They are
// statement macros wrapped in plain braces (not do/while(0)) and evaluate
// their arguments several times, so arguments must be side-effect-free
// lvalues/constants. `typeof` is a GNU extension. SHIFT and HALF are the
// per-type hooks: they are redefined before each float/int instantiation.

// Swap x and y.
#define XCHG(x,y) {\
    typeof(x) t = x;\
    x = y;\
    y = t;\
}

// ROTATE(x,y,a,b): in the first two variants, replaces (x,y) by
//   x' = SHIFT(a*x + b*y),  y' = SHIFT(b*x - a*y).
// Three interchangeable implementations are kept; the `#elif 1` one is the
// one actually compiled.
#if 0
// Direct form: four multiplies.
#define ROTATE(x,y,a,b) {\
    typeof(x) x1 = SHIFT(a*x + b*y);\
    typeof(x) y1 = SHIFT(b*x - a*y);\
    x = x1;\
    y = y1;\
}
#elif 1
// Active form: shares t = b*(x+y) between both outputs, which is
// algebraically the same result with three multiplies on the data.
// a and b are parenthesized here, so expressions such as c6*2 or -st0 are
// safe as coefficients.
#define ROTATE(x,y,a,b) {\
    typeof(x) t = (b)*(x+y);\
    x = SHIFT(t+((a)-(b))*x);\
    y = SHIFT(t-((a)+(b))*y);\
}
#else
// Lifting form: three shear steps followed by a swap. It token-pastes
// b, "m1" and a into the name of a precomputed constant; no such constants
// are visible in this file (presumably they would come from
// dcts_constants.h -- unverified).
#define ROTATE(x,y,a,b) {\
    x += SHIFT((b##m1##a)*y);\
    y += SHIFT(a*x);\
    x += SHIFT((b##m1##a)*y);\
    XCHG(x,y);\
}
#endif

// Butterfly followed by scaling both outputs by a:
//   x' = SHIFT((x+y)*a),  y' = SHIFT((x-y)*a).
// Uses BUTTERFLY, which is defined just below (fine for macros, since
// expansion happens at the use site).
#define ROTATE_MID(x,y,a) {\
    BUTTERFLY(x,y);\
    x = SHIFT(x*a);\
    y = SHIFT(y*a);\
}

// Plain sum/difference butterfly: x' = x + y, y' = x - y.
#define BUTTERFLY(x,y) {\
    typeof(x) t = x;\
    x = x + y;\
    y = t - y;\
}

// Two-step lifting butterflies used by the lapped-transform pre/post filter:
//   LIFT0: y' = x - y, then x' = x - HALF(y')
//   LIFT1: x' = x + HALF(y), then y' = x' - y
#define BUTTERFLY_LIFT0(x,y) {\
    y = x - y;\
    x -= HALF(y);\
}
#define BUTTERFLY_LIFT1(x,y) {\
    x += HALF(y);\
    y = x - y;\
}

// Product of x and y passed through SHIFT (an expression, not a statement).
#define MUL(x,y) \
    SHIFT((x)*(y))

// One row of a 4-point matrix transform applied to a0..a3. Each coefficient
// argument carries its own sign (e.g. +c1, -c3), so the expansion reads
// "+c1*a0 +c3*a1 ..." -- the sign doubles as the operator between terms.
#define MATRIX4(d0,d1,d2,d3)\
    SHIFT(d0*a0 d1*a1 d2*a2 d3*a3)

// DCT4(name) generates the 4-point transforms for one element type
// (name is `float` or `int`; `dctelem`, SHIFT and the c* constants must
// already be set up for that type):
//   fdct4_matrix_/idct4_matrix_             direct matrix products
//   fdct4_chen_/idct4_chen_                 butterfly + rotation factorization
//   fdct4_chen_denorm_/idct4_chen_denorm_   same structure, but the c2 scaling
//                                           stage is dropped and the rotation
//                                           uses the c1r2/c3r2 constants
//   test4_                                  driver printing all three variants
// All transforms work in place on block[0], block[stride], ...
// The constants c1, c2, c3, c1r2, c3r2 are not defined in this file
// (presumably supplied by dcts_constants.h for N == 4).
// test4_ feeds one unit impulse per position, prints the forward outputs
// (the denorm output is printed with multiplier c2), runs the inverses,
// divides by 2 (matrix, chen) or 4 (denorm) and prints the reconstructions.
#define DCT4(name) \
void fdct4_matrix_##name(dctelem *block, int stride) {\
    LOAD4;\
    SRC(0) = SHIFT(c2*(a0 + a1 + a2 + a3));\
    SRC(1) = MATRIX4(+c1, +c3, -c3, -c1);\
    SRC(2) = SHIFT(c2*(a0 - a1 - a2 + a3));\
    SRC(3) = MATRIX4(+c3, -c1, +c1, -c3);\
}\
void idct4_matrix_##name(dctelem *block, int stride) {\
    LOAD4;\
    SRC(0) = MATRIX4(+c2, +c1, +c2, +c3);\
    SRC(1) = MATRIX4(+c2, +c3, -c2, -c1);\
    SRC(2) = MATRIX4(+c2, -c3, -c2, +c1);\
    SRC(3) = MATRIX4(+c2, -c1, +c2, -c3);\
}\
void fdct4_chen_##name(dctelem *block, int stride) {\
    LOAD4;\
    BUTTERFLY(a0,a3);\
    BUTTERFLY(a1,a2);\
    ROTATE_MID(a0,a1,c2);\
    ROTATE(a3,a2,c1,c3);\
    SRC(0) = a0;\
    SRC(1) = a3;\
    SRC(2) = a1;\
    SRC(3) = a2;\
}\
void idct4_chen_##name(dctelem *block, int stride) {\
    LOAD4;\
    ROTATE_MID(a0,a2,c2);\
    ROTATE(a1,a3,c1,c3);\
    BUTTERFLY(a0,a1);\
    BUTTERFLY(a2,a3);\
    SRC(0) = a0;\
    SRC(1) = a2;\
    SRC(2) = a3;\
    SRC(3) = a1;\
}\
void fdct4_chen_denorm_##name(dctelem *block, int stride) {\
    LOAD4;\
    BUTTERFLY(a0,a3);\
    BUTTERFLY(a1,a2);\
    BUTTERFLY(a0,a1);\
    ROTATE(a3,a2,c1r2,c3r2);\
    SRC(0) = a0;\
    SRC(1) = a3;\
    SRC(2) = a1;\
    SRC(3) = a2;\
}\
void idct4_chen_denorm_##name(dctelem *block, int stride) {\
    LOAD4;\
    ROTATE(a1,a3,c1r2,c3r2);\
    BUTTERFLY(a0,a2);\
    BUTTERFLY(a0,a1);\
    BUTTERFLY(a2,a3);\
    SRC(0) = a0;\
    SRC(1) = a2;\
    SRC(2) = a3;\
    SRC(3) = a1;\
}\
void test4_##name() {\
    dctelem a[4], b[4], c[4];\
    int i,j;\
    printf("dct4 "#name"\n\n");\
    for(i=0; i<4; i++) {\
        memset(a,0,sizeof(a));\
        a[i] = sample_##name;\
        memcpy(b,a,sizeof(a));\
        memcpy(c,a,sizeof(a));\
        dump_##name(a,4,0,0);\
        fdct4_matrix_##name(a,1);\
        fdct4_chen_##name(b,1);\
        fdct4_chen_denorm_##name(c,1);\
        dump_##name(a,4,0,0);\
        dump_##name(b,4,0,0);\
        dump_##name(c,4,0,c2);\
        idct4_matrix_##name(a,1);\
        idct4_chen_##name(b,1);\
        idct4_chen_denorm_##name(c,1);\
        div_##name(a,4,1);\
        div_##name(b,4,1);\
        div_##name(c,4,2);\
        dump_##name(a,4,0,0);\
        dump_##name(b,4,0,0);\
        dump_##name(c,4,0,0);\
        printf("\n");\
    }\
}

// One row of an 8-point matrix transform applied to a0..a7. As with MATRIX4,
// every coefficient argument carries its own sign, which also serves as the
// operator joining the terms.
#define MATRIX8(d0,d1,d2,d3,d4,d5,d6,d7)\
    SHIFT(d0*a0 d1*a1 d2*a2 d3*a3 d4*a4 d5*a5 d6*a6 d7*a7)

// DCT8(name) generates the 8-point transforms for one element type
// (name is `float` or `int`; `dctelem`, SHIFT and the constants must already
// be set up for that type):
//   fdct8_matrix_/idct8_matrix_             direct matrix products
//   fdct8_chen_/idct8_chen_                 butterfly + rotation factorization
//   fdct8_chen_denorm_/idct8_chen_denorm_   same structure without the c4
//                                           scaling of the DC/4 pair, using
//                                           the c*r2 rotation constants
//   fdct8_aan_/idct8_aan_                   variant whose forward output has
//                                           to be multiplied by aan_scale[]
//                                           before it is compared (see test8_)
//   test8_                                  driver printing all four variants
// All transforms work in place on block[0], block[stride], ...
// The constants c1..c7, c1r2..c7r2 and r2 are not defined in this file
// (presumably supplied by dcts_constants.h for N == 8).
// In the chen/aan bodies the b0..b7 locals are just a renaming (permutation)
// of a0..a7 between stages.
// test8_ feeds one impulse per position, prints the forward outputs (denorm
// and aan_scale-corrected aan are printed with multiplier c4), runs the
// inverses, divides by 4 (matrix, chen) or 8 (denorm, aan) and prints the
// reconstructions.
#define DCT8(name) \
void fdct8_matrix_##name(dctelem *block, int stride) {\
    LOAD8;\
    SRC(0) = SHIFT(c4*(a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7));\
    SRC(1) = MATRIX8(+c1, +c3, +c5, +c7, -c7, -c5, -c3, -c1);\
    SRC(2) = MATRIX8(+c2, +c6, -c6, -c2, -c2, -c6, +c6, +c2);\
    SRC(3) = MATRIX8(+c3, -c7, -c1, -c5, +c5, +c1, +c7, -c3);\
    SRC(4) = SHIFT(c4*(a0 - a1 - a2 + a3 + a4 - a5 - a6 + a7));\
    SRC(5) = MATRIX8(+c5, -c1, +c7, +c3, -c3, -c7, +c1, -c5);\
    SRC(6) = MATRIX8(+c6, -c2, +c2, -c6, -c6, +c2, -c2, +c6);\
    SRC(7) = MATRIX8(+c7, -c5, +c3, -c1, +c1, -c3, +c5, -c7);\
}\
void idct8_matrix_##name(dctelem *block, int stride) {\
    LOAD8;\
    SRC(0) = MATRIX8(+c4, +c1, +c2, +c3, +c4, +c5, +c6, +c7);\
    SRC(1) = MATRIX8(+c4, +c3, +c6, -c7, -c4, -c1, -c2, -c5);\
    SRC(2) = MATRIX8(+c4, +c5, -c6, -c1, -c4, +c7, +c2, +c3);\
    SRC(3) = MATRIX8(+c4, +c7, -c2, -c5, +c4, +c3, -c6, -c1);\
    SRC(4) = MATRIX8(+c4, -c7, -c2, +c5, +c4, -c3, -c6, +c1);\
    SRC(5) = MATRIX8(+c4, -c5, -c6, +c1, -c4, -c7, +c2, -c3);\
    SRC(6) = MATRIX8(+c4, -c3, +c6, +c7, -c4, +c1, -c2, +c5);\
    SRC(7) = MATRIX8(+c4, -c1, +c2, -c3, +c4, -c5, +c6, -c7);\
}\
void fdct8_chen_##name(dctelem *block, int stride) {\
    LOAD8;\
    BUTTERFLY(a0,a7);\
    BUTTERFLY(a1,a6);\
    BUTTERFLY(a2,a5);\
    BUTTERFLY(a3,a4);\
    BUTTERFLY(a0,a3);\
    BUTTERFLY(a1,a2);\
    dctelem b0=a0, b1=a7, b2=a3, b3=a5, b4=a1, b5=a6, b6=a2, b7=a4;\
    ROTATE_MID(b0,b4,c4);\
    SRC(0) = b0;\
    SRC(4) = b4;\
    ROTATE(b2,b6,c2,c6);\
    SRC(2) = b2;\
    SRC(6) = b6;\
    ROTATE_MID(b5,b3,c4);\
    BUTTERFLY(b7,b3);\
    BUTTERFLY(b1,b5);\
    ROTATE(b1,b7,c1,c7);\
    ROTATE(b5,b3,c5,c3);\
    SRC(1) = b1;\
    SRC(3) = b3;\
    SRC(5) = b5;\
    SRC(7) = b7;\
}\
void idct8_chen_##name(dctelem *block, int stride) {\
    LOAD8;\
    ROTATE(a5,a3,c5,c3);\
    ROTATE(a1,a7,c1,c7);\
    ROTATE(a2,a6,c2,c6);\
    ROTATE_MID(a0,a4,c4);\
    BUTTERFLY(a7,a3);\
    BUTTERFLY(a1,a5);\
    ROTATE_MID(a5,a3,c4);\
    dctelem b0=a0, b7=a1, b3=a2, b5=a3, b1=a4, b6=a5, b2=a6, b4=a7;\
    BUTTERFLY(b1,b2);\
    BUTTERFLY(b0,b3);\
    BUTTERFLY(b3,b4);\
    BUTTERFLY(b2,b5);\
    BUTTERFLY(b1,b6);\
    BUTTERFLY(b0,b7);\
    SRC(0) = b0;\
    SRC(1) = b1;\
    SRC(2) = b2;\
    SRC(3) = b3;\
    SRC(4) = b4;\
    SRC(5) = b5;\
    SRC(6) = b6;\
    SRC(7) = b7;\
}\
void fdct8_chen_denorm_##name(dctelem *block, int stride) {\
    LOAD8;\
    BUTTERFLY(a0,a7);\
    BUTTERFLY(a1,a6);\
    BUTTERFLY(a2,a5);\
    BUTTERFLY(a3,a4);\
    BUTTERFLY(a0,a3);\
    BUTTERFLY(a1,a2);\
    BUTTERFLY(a0,a1);\
    dctelem b0=a0, b1=a7, b2=a3, b3=a5, b4=a1, b5=a6, b6=a2, b7=a4;\
    SRC(0) = b0;\
    SRC(4) = b4;\
    ROTATE(b2,b6,c2r2,c6r2);\
    SRC(2) = b2;\
    SRC(6) = b6;\
    ROTATE_MID(b5,b3,c4);\
    BUTTERFLY(b7,b3);\
    BUTTERFLY(b1,b5);\
    ROTATE(b1,b7,c1r2,c7r2);\
    ROTATE(b5,b3,c5r2,c3r2);\
    SRC(1) = b1;\
    SRC(3) = b3;\
    SRC(5) = b5;\
    SRC(7) = b7;\
}\
void idct8_chen_denorm_##name(dctelem *block, int stride) {\
    LOAD8;\
    ROTATE(a5,a3,c5r2,c3r2);\
    ROTATE(a1,a7,c1r2,c7r2);\
    ROTATE(a2,a6,c2r2,c6r2);\
    BUTTERFLY(a7,a3);\
    BUTTERFLY(a1,a5);\
    ROTATE_MID(a5,a3,c4);\
    dctelem b0=a0, b7=a1, b3=a2, b5=a3, b1=a4, b6=a5, b2=a6, b4=a7;\
    BUTTERFLY(b0,b1);\
    BUTTERFLY(b1,b2);\
    BUTTERFLY(b0,b3);\
    BUTTERFLY(b3,b4);\
    BUTTERFLY(b2,b5);\
    BUTTERFLY(b1,b6);\
    BUTTERFLY(b0,b7);\
    SRC(0) = b0;\
    SRC(1) = b1;\
    SRC(2) = b2;\
    SRC(3) = b3;\
    SRC(4) = b4;\
    SRC(5) = b5;\
    SRC(6) = b6;\
    SRC(7) = b7;\
}\
void fdct8_aan_##name(dctelem *block, int stride) {\
    LOAD8;\
    BUTTERFLY(a0,a7);\
    BUTTERFLY(a1,a6);\
    BUTTERFLY(a2,a5);\
    BUTTERFLY(a3,a4);\
    BUTTERFLY(a0,a3);\
    BUTTERFLY(a1,a2);\
    BUTTERFLY(a0,a1);\
    SRC(0) = a0;\
    SRC(4) = a1;\
    a2 += a3;\
    a2 = MUL(c4,a2);\
    BUTTERFLY(a3,a2);\
    SRC(2) = a3;\
    SRC(6) = a2;\
    a4 += a5;\
    a5 += a6;\
    a6 += a7;\
    ROTATE(a4,a6,c6,c2);\
    a5 = MUL(c4,a5);\
    BUTTERFLY(a7,a5);\
    BUTTERFLY(a7,a4);\
    BUTTERFLY(a5,a6);\
    SRC(5) = a5;\
    SRC(3) = a6;\
    SRC(1) = a7;\
    SRC(7) = a4;\
}\
void idct8_aan_##name(dctelem *block, int stride) {\
    LOAD8;\
    dctelem b0=a0, b1=a4, b2=a6, b3=a2, b4=a7, b5=a5, b6=a3, b7=a1;\
    BUTTERFLY(b5,b6);\
    BUTTERFLY(b7,b4);\
    BUTTERFLY(b7,b5);\
    b5 = MUL(r2,b5);\
    ROTATE(b4,b6,c6*2,c2*2);\
    b6 -= b7;\
    b5 -= b6;\
    b4 -= b5;\
    BUTTERFLY(b3,b2);\
    b2 = MUL(r2,b2);\
    b2 -= b3;\
    BUTTERFLY(b0,b1);\
    BUTTERFLY(b1,b2);\
    BUTTERFLY(b0,b3);\
    BUTTERFLY(b3,b4);\
    BUTTERFLY(b2,b5);\
    BUTTERFLY(b1,b6);\
    BUTTERFLY(b0,b7);\
    SRC(0) = b0;\
    SRC(1) = b1;\
    SRC(2) = b2;\
    SRC(3) = b3;\
    SRC(4) = b4;\
    SRC(5) = b5;\
    SRC(6) = b6;\
    SRC(7) = b7;\
}\
void test8_##name() {\
    dctelem a[8], b[8], c[8], d[8], dn[8];\
    int i,j;\
    printf("dct8 "#name"\n\n");\
    for(i=0; i<8; i++) {\
        memset(a,0,sizeof(a));\
        a[i] = sample_##name;\
        memcpy(b,a,sizeof(a));\
        memcpy(c,a,sizeof(a));\
        memcpy(d,a,sizeof(a));\
        dump_##name(a,8,0,0);\
        fdct8_matrix_##name(a,1);\
        fdct8_chen_##name(b,1);\
        fdct8_chen_denorm_##name(c,1);\
        fdct8_aan_##name(d,1);\
        for(j=0; j<8; j++)\
            dn[j] = d[j] * aan_scale[j];\
        dump_##name(a,8,0,0);\
        dump_##name(b,8,0,0);\
        dump_##name(c,8,0,c4);\
        dump_##name(dn,8,0,c4);\
        idct8_matrix_##name(a,1);\
        idct8_chen_##name(b,1);\
        idct8_chen_denorm_##name(c,1);\
        idct8_aan_##name(d,1);\
        div_##name(a,8,2);\
        div_##name(b,8,2);\
        div_##name(c,8,3);\
        div_##name(d,8,3);\
        dump_##name(a,8,0,0);\
        dump_##name(b,8,0,0);\
        dump_##name(c,8,0,0);\
        dump_##name(d,8,0,0);\
        printf("\n");\
    }\
}

// Floating-point coefficients of the lifting-based overlap filter in LBT8:
// S* are scale factors (S0i/S3i their reciprocals), P*/U* are the multipliers
// of the lifting steps. LBT8 itself only uses S0, S3, S0i, S3i, P0..P2 and
// U0..U2; S1 and S2 are defined but not referenced anywhere in this file.
// These definitions are replaced by 16.16 fixed-point integers before
// LBT8(int) is expanded further down.
#define S0  1.26759f
#define S1  0.99520f
#define S2  1.02608f
#define S3  1.10503f
#define S0i (1/S0)
#define S3i (1/S3)
#define P0 -0.45051f
#define U0  0.74719f
#define P1 -0.39633f
#define U1  0.51035f
#define P2 -0.13821f
#define U2  0.33535f

/* the overlap part of integer lbt is perfectly invertible as long as inputs stay
 * within 14 bits of dynamic range. the dct part is of course not perfect. */
// LBT8(name) generates, for one element type:
//   ftdlbt8_lift_  forward: for every boundary between consecutive 8-sample
//                  blocks, filter the 4 samples on each side (block[i-4..i+3])
//                  with LIFT0 butterflies, the S/P/U lifting chain and LIFT1
//                  butterflies, then run fdct8_chen_ on each block.
//   itdlbt8_lift_  inverse: idct8_chen_ on each block, then the same boundary
//                  structure with the lifting chain undone in reverse order
//                  (subtracting instead of adding, scaling by S0i/S3i).
// `block` holds dcts*8 contiguous samples; `dcts` is the number of 8-sample
// blocks. The first and last 4 samples of the buffer are not filtered.
#define LBT8(name)\
void ftdlbt8_lift_##name(dctelem *block, int dcts) {\
    int i;\
    for(i=8; i<dcts*8; i+=8) {\
        BUTTERFLY_LIFT0(block[i-1], block[i+0]);\
        BUTTERFLY_LIFT0(block[i-2], block[i+1]);\
        BUTTERFLY_LIFT0(block[i-3], block[i+2]);\
        BUTTERFLY_LIFT0(block[i-4], block[i+3]);\
        block[i+0] = MUL(S0,block[i+0]);\
        block[i+3] = MUL(S3,block[i+3]);\
        block[i+3] += MUL(P2,block[i+2]);\
        block[i+2] += MUL(U2,block[i+3]) + MUL(P1,block[i+1]);\
        block[i+1] += MUL(U1,block[i+2]) + MUL(P0,block[i+0]);\
        block[i+0] += MUL(U0,block[i+1]);\
        BUTTERFLY_LIFT1(block[i-1], block[i+0]);\
        BUTTERFLY_LIFT1(block[i-2], block[i+1]);\
        BUTTERFLY_LIFT1(block[i-3], block[i+2]);\
        BUTTERFLY_LIFT1(block[i-4], block[i+3]);\
    }\
    for(i=0; i<dcts*8; i+=8)\
        fdct8_chen_##name(block+i, 1);\
}\
void itdlbt8_lift_##name(dctelem *block, int dcts) {\
    int i;\
    for(i=0; i<dcts*8; i+=8)\
        idct8_chen_##name(block+i, 1);\
    for(i=8; i<dcts*8; i+=8) {\
        BUTTERFLY_LIFT0(block[i-1], block[i+0]);\
        BUTTERFLY_LIFT0(block[i-2], block[i+1]);\
        BUTTERFLY_LIFT0(block[i-3], block[i+2]);\
        BUTTERFLY_LIFT0(block[i-4], block[i+3]);\
        block[i+0] -= MUL(U0,block[i+1]);\
        block[i+1] -= MUL(U1,block[i+2]) + MUL(P0,block[i+0]);\
        block[i+2] -= MUL(U2,block[i+3]) + MUL(P1,block[i+1]);\
        block[i+3] -= MUL(P2,block[i+2]);\
        block[i+0] = MUL(S0i,block[i+0]);\
        block[i+3] = MUL(S3i,block[i+3]);\
        BUTTERFLY_LIFT1(block[i-1], block[i+0]);\
        BUTTERFLY_LIFT1(block[i-2], block[i+1]);\
        BUTTERFLY_LIFT1(block[i-3], block[i+2]);\
        BUTTERFLY_LIFT1(block[i-4], block[i+3]);\
    }\
}

// One row of a 16-point matrix transform applied to a0..a15. As with MATRIX4,
// every coefficient argument carries its own sign, which also serves as the
// operator joining the terms.
#define MATRIX16(d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15)\
    SHIFT(d0*a0 d1*a1 d2*a2 d3*a3 d4*a4 d5*a5 d6*a6 d7*a7 d8*a8 d9*a9 d10*a10 d11*a11 d12*a12 d13*a13 d14*a14 d15*a15)

// DCT16(name) generates the 16-point transforms for one element type
// (name is `float` or `int`; `dctelem`, SHIFT and the constants must already
// be set up for that type):
//   fdct16_matrix_/idct16_matrix_  direct matrix products
//   fdct16_llm_/idct16_llm_        butterfly/rotation factorization ("llm"
//                                  presumably refers to Loeffler-Ligtenberg-
//                                  Moschytz -- the file does not say)
//   test16_                        driver printing both variants
// All transforms work in place on block[0], block[stride], ...
// The constants c1..c15, c4r2, c12r2 and r2 are not defined in this file
// (presumably supplied by dcts_constants.h for N == 16).
// test16_ feeds one impulse per position, prints the forward outputs (the llm
// output is printed with multiplier c8), runs the inverses, divides by 8
// (matrix) or 16 (llm) and prints the reconstructions.
#define DCT16(name) \
void fdct16_matrix_##name(dctelem *block, int stride) {\
    LOAD16;\
    SRC( 0) = MATRIX16(+ c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8, + c8);\
    SRC( 1) = MATRIX16(+ c1, + c3, + c5, + c7, + c9, +c11, +c13, +c15, -c15, -c13, -c11, - c9, - c7, - c5, - c3, - c1);\
    SRC( 2) = MATRIX16(+ c2, + c6, +c10, +c14, -c14, -c10, - c6, - c2, - c2, - c6, -c10, -c14, +c14, +c10, + c6, + c2);\
    SRC( 3) = MATRIX16(+ c3, + c9, +c15, -c11, - c5, - c1, - c7, -c13, +c13, + c7, + c1, + c5, +c11, -c15, - c9, - c3);\
    SRC( 4) = MATRIX16(+ c4, +c12, -c12, - c4, - c4, -c12, +c12, + c4, + c4, +c12, -c12, - c4, - c4, -c12, +c12, + c4);\
    SRC( 5) = MATRIX16(+ c5, +c15, - c7, - c3, -c13, + c9, + c1, +c11, -c11, - c1, - c9, +c13, + c3, + c7, -c15, - c5);\
    SRC( 6) = MATRIX16(+ c6, -c14, - c2, -c10, +c10, + c2, +c14, - c6, - c6, +c14, + c2, +c10, -c10, - c2, -c14, + c6);\
    SRC( 7) = MATRIX16(+ c7, -c11, - c3, +c15, + c1, +c13, - c5, - c9, + c9, + c5, -c13, - c1, -c15, + c3, +c11, - c7);\
    SRC( 8) = MATRIX16(+ c8, - c8, - c8, + c8, + c8, - c8, - c8, + c8, + c8, - c8, - c8, + c8, + c8, - c8, - c8, + c8);\
    SRC( 9) = MATRIX16(+ c9, - c5, -c13, + c1, -c15, - c3, +c11, + c7, - c7, -c11, + c3, +c15, - c1, +c13, + c5, - c9);\
    SRC(10) = MATRIX16(+c10, - c2, +c14, + c6, - c6, -c14, + c2, -c10, -c10, + c2, -c14, - c6, + c6, +c14, - c2, +c10);\
    SRC(11) = MATRIX16(+c11, - c1, + c9, +c13, - c3, + c7, +c15, - c5, + c5, -c15, - c7, + c3, -c13, - c9, + c1, -c11);\
    SRC(12) = MATRIX16(+c12, - c4, + c4, -c12, -c12, + c4, - c4, +c12, +c12, - c4, + c4, -c12, -c12, + c4, - c4, +c12);\
    SRC(13) = MATRIX16(+c13, - c7, + c1, - c5, +c11, +c15, - c9, + c3, - c3, + c9, -c15, -c11, + c5, - c1, + c7, -c13);\
    SRC(14) = MATRIX16(+c14, -c10, + c6, - c2, + c2, - c6, +c10, -c14, -c14, +c10, - c6, + c2, - c2, + c6, -c10, +c14);\
    SRC(15) = MATRIX16(+c15, -c13, +c11, - c9, + c7, - c5, + c3, - c1, + c1, - c3, + c5, - c7, + c9, -c11, +c13, -c15);\
}\
void idct16_matrix_##name(dctelem *block, int stride) {\
    LOAD16;\
    SRC( 0) = MATRIX16(+c8, +c1, +c2, +c3, +c4, +c5, +c6, +c7, +c8, +c9, +c10, +c11, +c12, +c13, +c14, +c15);\
    SRC( 1) = MATRIX16(+c8, +c3, +c6, +c9, +c12, +c15, -c14, -c11, -c8, -c5, -c2, -c1, -c4, -c7, -c10, -c13);\
    SRC( 2) = MATRIX16(+c8, +c5, +c10, +c15, -c12, -c7, -c2, -c3, -c8, -c13, +c14, +c9, +c4, +c1, +c6, +c11);\
    SRC( 3) = MATRIX16(+c8, +c7, +c14, -c11, -c4, -c3, -c10, +c15, +c8, +c1, +c6, +c13, -c12, -c5, -c2, -c9);\
    SRC( 4) = MATRIX16(+c8, +c9, -c14, -c5, -c4, -c13, +c10, +c1, +c8, -c15, -c6, -c3, -c12, +c11, +c2, +c7);\
    SRC( 5) = MATRIX16(+c8, +c11, -c10, -c1, -c12, +c9, +c2, +c13, -c8, -c3, -c14, +c7, +c4, +c15, -c6, -c5);\
    SRC( 6) = MATRIX16(+c8, +c13, -c6, -c7, +c12, +c1, +c14, -c5, -c8, +c11, +c2, +c15, -c4, -c9, +c10, +c3);\
    SRC( 7) = MATRIX16(+c8, +c15, -c2, -c13, +c4, +c11, -c6, -c9, +c8, +c7, -c10, -c5, +c12, +c3, -c14, -c1);\
    SRC( 8) = MATRIX16(+c8, -c15, -c2, +c13, +c4, -c11, -c6, +c9, +c8, -c7, -c10, +c5, +c12, -c3, -c14, +c1);\
    SRC( 9) = MATRIX16(+c8, -c13, -c6, +c7, +c12, -c1, +c14, +c5, -c8, -c11, +c2, -c15, -c4, +c9, +c10, -c3);\
    SRC(10) = MATRIX16(+c8, -c11, -c10, +c1, -c12, -c9, +c2, -c13, -c8, +c3, -c14, -c7, +c4, -c15, -c6, +c5);\
    SRC(11) = MATRIX16(+c8, -c9, -c14, +c5, -c4, +c13, +c10, -c1, +c8, +c15, -c6, +c3, -c12, -c11, +c2, -c7);\
    SRC(12) = MATRIX16(+c8, -c7, +c14, +c11, -c4, +c3, -c10, -c15, +c8, -c1, +c6, -c13, -c12, +c5, -c2, +c9);\
    SRC(13) = MATRIX16(+c8, -c5, +c10, -c15, -c12, +c7, -c2, +c3, -c8, +c13, +c14, -c9, +c4, -c1, +c6, -c11);\
    SRC(14) = MATRIX16(+c8, -c3, +c6, -c9, +c12, -c15, -c14, +c11, -c8, +c5, -c2, +c1, -c4, +c7, -c10, +c13);\
    SRC(15) = MATRIX16(+c8, -c1, +c2, -c3, +c4, -c5, +c6, -c7, +c8, -c9, +c10, -c11, +c12, -c13, +c14, -c15);\
}\
void fdct16_llm_##name(dctelem *block, int stride) {\
    LOAD16;\
    BUTTERFLY(a0,a15);\
    BUTTERFLY(a1,a14);\
    BUTTERFLY(a2,a13);\
    BUTTERFLY(a3,a12);\
    BUTTERFLY(a4,a11);\
    BUTTERFLY(a5,a10);\
    BUTTERFLY(a6,a9);\
    BUTTERFLY(a7,a8);\
    BUTTERFLY(a0,a7);\
    BUTTERFLY(a1,a6);\
    BUTTERFLY(a2,a5);\
    BUTTERFLY(a3,a4);\
    BUTTERFLY(a0,a3);\
    BUTTERFLY(a1,a2);\
    BUTTERFLY(a0,a1);\
    SRC(0) = a0;\
    SRC(8) = a1;\
    ROTATE(a3,a2,c4r2,c12r2);\
    SRC(12) = a2;\
    SRC(4) = a3;\
    ROTATE(a7,a4,c2,c14);\
    ROTATE(a6,a5,c6,c10);\
    BUTTERFLY(a7,a6);\
    BUTTERFLY(a4,a5);\
    BUTTERFLY(a6,a4);\
    a5 = SHIFT(r2*a5);\
    a7 = SHIFT(r2*a7);\
    SRC(10) = a4;\
    SRC(14) = a5;\
    SRC(6) = a6;\
    SRC(2) = a7;\
    ROTATE(a15,a8,c9,c7);\
    ROTATE(a9,a14,c5,c11);\
    ROTATE(a13,a10,c13,c3);\
    ROTATE(a11,a12,c1,c15);\
    BUTTERFLY(a15,a12);\
    BUTTERFLY(a14,a13);\
    BUTTERFLY(a11,a8);\
    BUTTERFLY(a10,a9);\
    BUTTERFLY(a15,a9);\
    BUTTERFLY(a14,a8);\
    ROTATE(a15,a14,c4r2,c12r2);\
    ROTATE(a8,a9,c12r2,c4r2);\
    BUTTERFLY(a13,a12);\
    BUTTERFLY(a11,a10);\
    BUTTERFLY(a11,a12);\
    a13 = SHIFT(r2*a13);\
    a10 = SHIFT(r2*a10);\
    SRC(11) = a9;\
    SRC(5) = a8;\
    SRC(7) = a10;\
    SRC(1) = a12;\
    SRC(15) = a11;\
    SRC(9) = a13;\
    SRC(3) = a14;\
    SRC(13) = a15;\
}\
void idct16_llm_##name(dctelem *block, int stride) {\
    LOAD16;\
    dctelem b0=a0, b1=a8, b2=a12, b3=a4, b4=a10, b5=a14, b6=a6, b7=a2, b8=a5, b9=a11, b10=a7, b11=a15, b12=a1, b13=a9, b14=a3, b15=a13;\
    b10 = SHIFT(r2*b10);\
    b13 = SHIFT(r2*b13);\
    BUTTERFLY(b11,b12);\
    BUTTERFLY(b11,b10);\
    BUTTERFLY(b13,b12);\
    ROTATE(b8,b9,c12r2,c4r2);\
    ROTATE(b15,b14,c4r2,c12r2);\
    BUTTERFLY(b14,b8);\
    BUTTERFLY(b15,b9);\
    BUTTERFLY(b10,b9);\
    BUTTERFLY(b11,b8);\
    BUTTERFLY(b14,b13);\
    BUTTERFLY(b15,b12);\
    ROTATE(b11,b12,c1,c15);\
    ROTATE(b13,b10,c13,c3);\
    ROTATE(b9,b14,c5,c11);\
    ROTATE(b15,b8,c9,c7);\
    b7 = SHIFT(r2*b7);\
    b5 = SHIFT(r2*b5);\
    BUTTERFLY(b6,b4);\
    BUTTERFLY(b4,b5);\
    BUTTERFLY(b7,b6);\
    ROTATE(b6,b5,c6,c10);\
    ROTATE(b7,b4,c2,c14);\
    ROTATE(b3,b2,c4r2,c12r2);\
    BUTTERFLY(b0,b1);\
    BUTTERFLY(b1,b2);\
    BUTTERFLY(b0,b3);\
    BUTTERFLY(b3,b4);\
    BUTTERFLY(b2,b5);\
    BUTTERFLY(b1,b6);\
    BUTTERFLY(b0,b7);\
    BUTTERFLY(b7,b8);\
    BUTTERFLY(b6,b9);\
    BUTTERFLY(b5,b10);\
    BUTTERFLY(b4,b11);\
    BUTTERFLY(b3,b12);\
    BUTTERFLY(b2,b13);\
    BUTTERFLY(b1,b14);\
    BUTTERFLY(b0,b15);\
    SRC(0) = b0;\
    SRC(1) = b1;\
    SRC(2) = b2;\
    SRC(3) = b3;\
    SRC(4) = b4;\
    SRC(5) = b5;\
    SRC(6) = b6;\
    SRC(7) = b7;\
    SRC(8) = b8;\
    SRC(9) = b9;\
    SRC(10) = b10;\
    SRC(11) = b11;\
    SRC(12) = b12;\
    SRC(13) = b13;\
    SRC(14) = b14;\
    SRC(15) = b15;\
}\
void test16_##name() {\
    dctelem a[16], b[16];\
    int i,j;\
    printf("dct16 "#name"\n\n");\
    for(i=0; i<16; i++) {\
        memset(a,0,sizeof(a));\
        a[i] = sample_##name;\
        memcpy(b,a,sizeof(a));\
        dump_##name(a,16,0,0);\
        fdct16_matrix_##name(a,1);\
        fdct16_llm_##name(b,1);\
        dump_##name(a,16,0,0);\
        dump_##name(b,16,0,c8);\
        idct16_matrix_##name(a,1);\
        idct16_llm_##name(b,1);\
        div_##name(a,16,3);\
        div_##name(b,16,4);\
        dump_##name(a,16,0,0);\
        dump_##name(b,16,0,0);\
        printf("\n");\
    }\
}

// ---- Float instantiation -------------------------------------------------
// The generators above are expanded once per element type. For float:
// dctelem is float, SHIFT is the identity and HALF is a multiply by 0.5.
// dcts_constants.h is re-included with N set to the transform size before
// each generator (and with USE_FLOAT defined); presumably it (re)defines the
// c* / r2 constants for that size and type -- the header is not visible here.
#undef SHIFT
#define dctelem float
#define SHIFT(x) (x)
#define HALF(x) ((x)*.5f)
#define USE_FLOAT
#define N 4
#include "dcts_constants.h"
#undef N
DCT4(float)

#define N 8
#include "dcts_constants.h"
#undef N
DCT8(float)
LBT8(float)

#define N 16
#include "dcts_constants.h"
#undef N
DCT16(float)
#undef dctelem
#undef SHIFT
#undef HALF
#undef USE_FLOAT


// ---- Fixed-point LBT coefficients ------------------------------------------
// Replace the float lifting coefficients with integers scaled by 65536
// (e.g. 83073 ~= 1.26759 * 65536, and S0i = 51701 ~= 65536 / 1.26759), to
// match the >>16 in the integer SHIFT. S1 and S2 are left at their float
// definitions; nothing in this file uses them.
#undef S0
#undef S3
#undef S0i
#undef S3i
#undef P0
#undef U0
#undef P1
#undef U1
#undef P2
#undef U2
#define S0  83073
#define S3  72419
#define S0i 51701
#define S3i 59307
#define P0 -29525
#define U0  48968
#define P1 -25974
#define U1  33446
#define P2  -9058
#define U2  21977


// ---- Int instantiation ---------------------------------------------------
// dctelem is int, SHIFT rounds (adds 1<<15) and shifts right by 16, HALF is
// a right shift by one. USE_FLOAT is not defined for these includes.
#define dctelem int
#define SHIFT(x) (((x)+(1<<15))>>16)
#define HALF(x) ((x)>>1)
#define N 4
#include "dcts_constants.h"
#undef N
DCT4(int)

#define N 8
#include "dcts_constants.h"
#undef N
DCT8(int)
LBT8(int)

#define N 16
#include "dcts_constants.h"
#undef N
DCT16(int)
#undef dctelem
#undef SHIFT
#undef HALF

// DCT_2D(name, dctelem, n) builds separable n x n transforms out of the 1-D
// routines fdct<n>_<name> / idct<n>_<name>. `block` is a row-major n*n array.
//   fdct<n>x<n>_<name>: first transforms each column (start block+i,
//                       stride n), then each row (start block+i*n, stride 1).
//   idct<n>x<n>_<name>: the reverse order -- rows first, then columns.
// Here `dctelem` is a macro parameter, not the file-level dctelem macro
// (which is undefined at this point).
#define DCT_2D(name, dctelem, n) \
void fdct##n##x##n##_##name(dctelem *block) {\
    int i;\
    for(i=0; i<n; i++)\
        fdct##n##_##name(block+i, n);\
    for(i=0; i<n; i++)\
        fdct##n##_##name(block+i*n, 1);\
}\
void idct##n##x##n##_##name(dctelem *block) {\
    int i;\
    for(i=0; i<n; i++)\
        idct##n##_##name(block+i*n, 1);\
    for(i=0; i<n; i++)\
        idct##n##_##name(block+i*1, n);\
}

// Instantiate DCT_2D for both element types of one algorithm/size pair.
#define DCT_2D_IF(name, n) \
        DCT_2D(name##_##float, float, n)\
        DCT_2D(name##_##int, int, n)

// 2-D wrappers for every 1-D algorithm generated above. (The trailing ';'
// after each expansion is an empty file-scope declaration.)
DCT_2D_IF(matrix, 4);
DCT_2D_IF(matrix, 8);
DCT_2D_IF(matrix, 16);
DCT_2D_IF(chen, 4);
DCT_2D_IF(chen, 8);
DCT_2D_IF(chen_denorm, 4);
DCT_2D_IF(chen_denorm, 8);
DCT_2D_IF(aan, 8);
DCT_2D_IF(llm, 16);


// Setup for the hand-written float lapped-transform code below: SHIFT is the
// identity again (so ROTATE/ROTATE_MID do plain float arithmetic) and
// dcts_constants.h is included once more for N == 8 with USE_FLOAT defined
// (presumably to bring the 8-point float constants such as c4 and r2 back
// into effect -- the header is not visible here).
#define SHIFT(x) (x)
#define USE_FLOAT
#define N 8
#include "dcts_constants.h"
#undef N
// Cosine/sine pairs of the three rotation angles used by fzs8_float and
// izs8_float; the third pair reuses the first angle.
#define ct0 0.917754625683454f // cos .13*pi
#define st0 0.397147890635998f // sin .13*pi
#define ct1 0.876306680043077f // cos .16*pi
#define st1 0.481753674103146f // sin .16*pi
#define ct2 ct0
#define st2 st0

// Forward rotation chain applied by flbt8_float to one 8-coefficient block:
// three ROTATEs running down the even-indexed coefficients, pairing
// (0,2), then (2,4), then (4,6), with coefficients (ct, -st). Each step uses
// the result of the previous one, so the order matters.
void fzs8_float(float *block) {
    ROTATE(block[0], block[2], ct0, -st0);
    ROTATE(block[2], block[4], ct1, -st1);
    ROTATE(block[4], block[6], ct2, -st2);
}

// Inverse of fzs8_float: the same three ROTATEs applied in the opposite
// order. ROTATE maps (x,y) to (a*x+b*y, b*x-a*y), which is its own inverse
// when a*a+b*b == 1, as is the case for the cos/sin pairs used here (up to
// float rounding), so no separate inverse coefficients are needed.
void izs8_float(float *block) {
    ROTATE(block[4], block[6], ct2, -st2);
    ROTATE(block[2], block[4], ct1, -st1);
    ROTATE(block[0], block[2], ct0, -st0);
}

// Forward lapped transform built as a post-processing of per-block DCTs.
// block holds dcts*8 contiguous floats; the result replaces the input.
void flbt8_float(float *block, int dcts) {
    int i;
    // 1. Independent 8-point DCT of every block.
    for(i=0; i<dcts*8; i+=8)
        fdct8_chen_float(block+i, 1);
    // 2. Within each block, scale every odd coefficient by r2 and rotate it
    //    with its even neighbour (pairs (0,1), (2,3), ...).
    for(i=0; i<dcts*8; i+=2) {
        block[i+1] *= r2;
        ROTATE(block[i], block[i+1], c4, c4);
    }
    // 3. Couple across positions: element i (even, from the second block on)
    //    is rotated with element i-7 (odd, 7 positions earlier).
    for(i=8; i<dcts*8; i+=2)
        ROTATE(block[i-7], block[i], c4, c4);
    // 4. Rotation chain on the even coefficients of every block but the first.
    for(i=8; i<dcts*8; i+=8)
        fzs8_float(block+i);
}

// Inverse counterpart of flbt8_float: the same four stages in reverse order,
// finishing with the per-block inverse DCT. block holds dcts*8 floats.
void ilbt8_float(float *block, int dcts) {
    int i;
    // Undo the even-coefficient rotation chain (all blocks but the first).
    for(i=8; i<dcts*8; i+=8)
        izs8_float(block+i);
    // Undo the coupling between element i and element i-7.
    for(i=8; i<dcts*8; i+=2)
        ROTATE(block[i-7], block[i], c4, c4);
    // Undo the in-block pair rotation; here the odd coefficient is scaled by
    // c4 after the rotation (the forward pass scaled by r2 before it).
    for(i=0; i<dcts*8; i+=2) {
        ROTATE(block[i], block[i+1], c4, c4);
        block[i+1] *= c4;
    }
    // Inverse 8-point DCT of every block.
    for(i=0; i<dcts*8; i+=8)
        idct8_chen_float(block+i, 1);
}

// 4x4 matrices applied (via vec_mul) to the four samples after each block
// boundary by the time-domain lapped transforms below.
//
// fv / iv: used by ftdlot8_float / itdlot8_float. iv is the transpose of fv.
// V = J*DCT2*DCT4*J
static const float fv[4][4] = {
    { 0.8072488677, 0.5593674850, 0.1436326789, 0.1217659055 },
    {-0.5717540172, 0.6992029119, 0.4214177954, 0.0813613769 },
    { 0.1217659056,-0.4443051224, 0.8600399285, 0.2193110666 },
    {-0.0813613769,-0.0285703162,-0.2492148003, 0.9646014963 },
};
static const float iv[4][4] = {
    { 0.8072488677,-0.5717540172, 0.1217659056,-0.0813613769 },
    { 0.5593674850, 0.6992029119,-0.4443051224,-0.0285703162 },
    { 0.1436326789, 0.4214177954, 0.8600399285,-0.2492148003 },
    { 0.1217659055, 0.0813613769, 0.2193110666, 0.9646014963 },
};

// fvs / ivs: used by ftdlbt8_float / itdlbt8_float. Unlike fv/iv these are
// not transposes of each other; ivs is presumably the matrix inverse of fvs
// (not checked by any code in this file).
// V = J*DCT2*S*DCT4*J with s=sqrt(2)
static const float fvs[4][4] = {
    { 0.8358191839, 0.6407288619, 0.2653985844, 0.2653985844 },
    {-0.5431837010, 0.7805642889, 0.5431837009, 0.2249940558 },
    { 0.1503362218,-0.3629437454, 0.9818058341, 0.3629437454 },
    {-0.0527910607, 0.0527910607,-0.1274488948, 1.1082341752 },
};
static const float ivs[4][4] = {
    { 0.7870466034,-0.5919562815, 0.1015636412,-0.1015636412 },
    { 0.5018363037, 0.6416717306,-0.5018363037,-0.0861014975 },
    { 0.0575311813, 0.3353162978, 0.7739384310,-0.3353162979 },
    { 0.0202022643,-0.0202022643, 0.1177474253, 0.8630378551 },
};

/* Sum/difference butterflies across a block boundary. `block` points at the
 * first sample after the boundary; sample block[-1-k] is paired with
 * block[k] for k = 0..3, so block[-4..3] must be valid. After the call the
 * lower element of each pair holds the sum and the upper one the difference
 * (lower minus upper). */
void butterflies(float *block) {
    int k;
    for (k = 0; k < 4; k++) {
        BUTTERFLY(block[-1-k], block[k]);
    }
}

/* In-place 4x4 matrix-vector product: v <- m * v.
 * The product is accumulated into a temporary so that every output row is
 * computed from the original contents of v. */
void vec_mul(const float m[4][4], float *v) {
    float out[4] = {0};
    for (int row = 0; row < 4; row++) {
        const float *coef = m[row];
        for (int col = 0; col < 4; col++)
            out[row] += coef[col]*v[col];
    }
    memcpy(v, out, sizeof out);
}

/* Forward time-domain lapped transform using the fv matrix.
 * block holds dcts*8 contiguous floats and is transformed in place:
 *   1. at every boundary between 8-sample blocks: butterflies, multiply the
 *      four samples after the boundary by fv, butterflies again;
 *   2. halve every sample except the first and last four of the buffer
 *      (exactly the samples touched by step 1);
 *   3. 8-point forward DCT of each block. */
void ftdlot8_float(float *block, int dcts) {
    int pos;
    for (pos = 8; pos < dcts*8; pos += 8) {
        float *seam = block + pos;
        butterflies(seam);
        vec_mul(fv, seam);
        butterflies(seam);
    }
    for (pos = 4; pos < dcts*8-4; pos++) {
        block[pos] *= .5;
    }
    for (pos = 0; pos < dcts*8; pos += 8) {
        fdct8_chen_float(block + pos, 1);
    }
}

/* Inverse of ftdlot8_float. block holds dcts*8 contiguous floats and is
 * transformed in place:
 *   1. 8-point inverse DCT of each block;
 *   2. halve every sample except the first and last four of the buffer;
 *   3. at every boundary between 8-sample blocks: butterflies, multiply the
 *      four samples after the boundary by iv (the transpose of fv),
 *      butterflies again.
 * The inverse DCT's own gain is not removed here; the test driver in this
 * file rescales the result when printing. */
void itdlot8_float(float *block, int dcts) {
    int i;
    for(i=0; i<dcts*8; i+=8)
        idct8_chen_float(block+i, 1);
    for(i=4; i<dcts*8-4; i++)
        block[i] *= .5;
    for(i=8; i<dcts*8; i+=8) {
        butterflies(block+i);
        vec_mul(iv, block+i);
        butterflies(block+i);
    }
}

/* Forward time-domain lapped transform using the fvs matrix (the variant of
 * fv that includes the s=sqrt(2) scaling). Same structure as ftdlot8_float.
 * block holds dcts*8 contiguous floats and is transformed in place:
 *   1. at every boundary between 8-sample blocks: butterflies, multiply the
 *      four samples after the boundary by fvs, butterflies again;
 *   2. halve every sample except the first and last four of the buffer;
 *   3. 8-point forward DCT of each block. */
void ftdlbt8_float(float *block, int dcts) {
    int pos;
    for (pos = 8; pos < dcts*8; pos += 8) {
        float *seam = block + pos;
        butterflies(seam);
        vec_mul(fvs, seam);
        butterflies(seam);
    }
    for (pos = 4; pos < dcts*8-4; pos++) {
        block[pos] *= .5;
    }
    for (pos = 0; pos < dcts*8; pos += 8) {
        fdct8_chen_float(block + pos, 1);
    }
}

/* Inverse of ftdlbt8_float. block holds dcts*8 contiguous floats and is
 * transformed in place:
 *   1. 8-point inverse DCT of each block;
 *   2. halve every sample except the first and last four of the buffer;
 *   3. at every boundary between 8-sample blocks: butterflies, multiply the
 *      four samples after the boundary by ivs, butterflies again.
 * The inverse DCT's own gain is not removed here; the test driver in this
 * file rescales the result when printing. */
void itdlbt8_float(float *block, int dcts) {
    int i;
    for(i=0; i<dcts*8; i+=8)
        idct8_chen_float(block+i, 1);
    for(i=4; i<dcts*8-4; i++)
        block[i] *= .5;
    for(i=8; i<dcts*8; i+=8) {
        butterflies(block+i);
        vec_mul(ivs, block+i);
        butterflies(block+i);
    }
}

/* Print a side-by-side comparison of the C 4x4 Chen DCT and the two SSE2
 * 4x4 routines (fdct4x4_chen_float_sse2 / fdct4x4_chen_denorm_float_sse2 and
 * their inverses, declared outside this file) for each of the 16 unit-impulse
 * inputs.
 * For every impulse: the input, the three forward results (the denorm one is
 * halved first; the SSE2 results are printed through the 4x4 index swap,
 * presumably because those routines leave their output transposed), and the
 * three reconstructions after rescaling by 1/4 (1/8 for denorm) are printed. */
void test4x4_float() {
    float __attribute__((aligned(16))) a[16], b[16], c[16];
    int i;
    printf("dct4x4 float\n\n");
    for(i=0; i<16; i++) {
        memset(a,0,sizeof(a));
        a[i] = 1;
        memcpy(b,a,sizeof(a));
        memcpy(c,a,sizeof(a));
        dump_float(a,16,0,0);
        fdct4x4_chen_float(a);
        fdct4x4_chen_float_sse2(b);
        fdct4x4_chen_denorm_float_sse2(c);
        div_float(c,16,1);
        dump_float(a,16,0,0);
        dump_float(b,16,2,0);
        dump_float(c,16,2,0);
        idct4x4_chen_float(a);
        idct4x4_chen_float_sse2(b);
        idct4x4_chen_denorm_float_sse2(c);
        div_float(a,16,2);
        div_float(b,16,2);
        div_float(c,16,3);
        dump_float(a,16,0,0);
        dump_float(b,16,0,0);
        dump_float(c,16,0,0);
        printf("\n");
    }
}

// Print a comparison of the C 8x8 denormalized Chen DCT against the SSE2
// denorm and AAN 8x8 routines (declared outside this file).
// Only the first 32 of the 64 impulse positions are exercised and only the
// first 32 coefficients of each result are printed.
void test8x8_float() {
    float __attribute__((aligned(16))) a[64], b[64], c[64], cn[64];
    int i,j;
    printf("dct8x8 float\n\n");
    for(i=0; i<32; i++) {
        // Unit impulse at position i, replicated for each implementation.
        memset(a,0,sizeof(a));
        a[i] = 1;
        memcpy(b,a,sizeof(a));
        memcpy(c,a,sizeof(a));
        dump_float(a,32,0,0);
        fdct8x8_chen_denorm_float(a);
        fdct8x8_chen_denorm_float_sse2(b);
        fdct8x8_aan_float_sse2(c);
        // Apply the separable AAN scale (one factor per index nibble) to a
        // copy, so that c itself stays unscaled for the inverse below.
        for(j=0; j<64; j++)
            cn[j] = c[j] * aan_scale[j&7] * aan_scale[j>>3];
        dump_float(a,32,0,0);
        // SIMD results are printed through the 8x8 index swap (transpose=3),
        // presumably because those routines leave their output transposed.
        dump_float(b,32,3,0);
        dump_float(cn,32,3,0);
        idct8x8_chen_denorm_float(a);
        idct8x8_chen_denorm_float_sse2(b);
        idct8x8_aan_float_sse2(c);
        // Remove the combined forward+inverse gain of 2^6 before printing.
        div_float(a,64,6);
        div_float(b,64,6);
        div_float(c,64,6);
        dump_float(a,32,0,0);
        dump_float(b,32,0,0);
        dump_float(c,32,0,0);
        printf("\n");
    }
}

// Integer counterpart of test8x8_float: compares the C 8x8 denormalized Chen
// DCT on int data against the SSSE3 denorm and AAN routines (declared outside
// this file), which operate on int16_t data.
// Only the first 32 of the 64 impulse positions are exercised and only the
// first 32 coefficients of each result are printed.
void test8x8_int() {
    int __attribute__((aligned(16))) a[64];
    int16_t __attribute__((aligned(16))) b[64], c[64], cn[64];
    int i,j;
    printf("dct8x8 int\n\n");
    for(i=0; i<32; i++) {
        // Impulse of amplitude 1000 at position i in both the int and the
        // int16_t buffers.
        memset(a,0,sizeof(a));
        memset(b,0,sizeof(b));
        a[i] = b[i] = 1000;
        memcpy(c,b,sizeof(b));
        dump_int(a,32,0,0);
        fdct8x8_chen_denorm_int(a);
        fdct8x8_chen_denorm_int_ssse3(b);
        fdct8x8_aan_int_ssse3(c);
        // Scaled copy of the AAN output for printing; the float product is
        // converted back to int16_t on assignment.
        for(j=0; j<64; j++)
            cn[j] = c[j] * aan_scale[j&7] * aan_scale[j>>3];
        dump_int(a,32,0,0);
        // SIMD results are printed through the 8x8 index swap (transpose=3),
        // presumably because those routines leave their output transposed.
        dump_int16(b,32,3,0);
        dump_int16(cn,32,3,0);
        // The gain is removed in two steps of 2^3: once between the forward
        // and inverse transforms and once after the inverse.
        div_int(a,64,3);
        div_int16(b,64,3);
        div_int16(c,64,3);
        idct8x8_chen_denorm_int(a);
        idct8x8_chen_denorm_int_ssse3(b);
        idct8x8_aan_int_ssse3(c);
        div_int(a,64,3);
        div_int16(b,64,3);
        div_int16(c,64,3);
        dump_int(a,32,0,0);
        dump_int16(b,32,0,0);
        dump_int16(c,32,0,0);
        printf("\n");
    }
}

/* Print a comparison of the C 16x16 "llm" DCT and the SSE2 16x16 routine
 * (fdct16x16_llm_float_sse2 / idct16x16_llm_float_sse2, declared outside this
 * file). Only the first 32 of the 256 impulse positions are exercised and
 * only the first 32 coefficients of each result are printed.
 * The SSE2 forward result is printed through the 16x16 index swap
 * (transpose=4), presumably because that routine leaves its output
 * transposed. Reconstructions are rescaled by 2^-8 before printing. */
void test16x16_float() {
    float __attribute__((aligned(16))) a[256], b[256];
    int i;
    printf("dct16x16 float\n\n");
    for(i=0; i<32; i++) {
        memset(a,0,sizeof(a));
        a[i] = 1;
        memcpy(b,a,sizeof(a));
        dump_float(a,32,0,0);
        fdct16x16_llm_float(a);
        fdct16x16_llm_float_sse2(b);
        dump_float(a,32,0,0);
        dump_float(b,32,4,0);
        idct16x16_llm_float(a);
        idct16x16_llm_float_sse2(b);
        div_float(a,256,8);
        div_float(b,256,8);
        dump_float(a,32,0,0);
        dump_float(b,32,0,0);
        printf("\n");
    }
}

/* Integer counterpart of test16x16_float: compares the C 16x16 "llm" DCT on
 * int data with the SSSE3 routine (declared outside this file), which
 * operates on int16_t data. Only the first 32 of the 256 impulse positions
 * (amplitude 1000) are exercised and only the first 32 coefficients of each
 * result are printed.
 * The SSSE3 forward result is printed through the 16x16 index swap
 * (transpose=4), presumably because that routine leaves its output
 * transposed. The gain is removed in two steps of 2^4: once between the
 * forward and inverse transforms and once after the inverse. */
void test16x16_int() {
    int __attribute__((aligned(16))) a[256];
    int16_t __attribute__((aligned(16))) b[256];
    int i;
    printf("dct16x16 int\n\n");
    for(i=0; i<32; i++) {
        memset(a,0,sizeof(a));
        memset(b,0,sizeof(b));
        a[i] = b[i] = 1000;
        dump_int(a,32,0,0);
        fdct16x16_llm_int(a);
        fdct16x16_llm_int_ssse3(b);
        dump_int(a,32,0,0);
        dump_int16(b,32,4,0);
        div_int(a,256,4);
        div_int16(b,256,4);
        idct16x16_llm_int(a);
        idct16x16_llm_int_ssse3(b);
        div_int(a,256,4);
        div_int16(b,256,4);
        dump_int(a,32,0,0);
        dump_int16(b,32,0,0);
        printf("\n");
    }
}

/* Exercise the float lapped transforms on a 32-sample (4-block) buffer.
 *
 * First loop: for a unit impulse at each position 4..19, run the plain
 * per-block DCT (a), flbt8_float (b), ftdlot8_float (c), ftdlbt8_float (d)
 * and ftdlbt8_lift_float (e) together with their inverses. Only d and e are
 * printed (forward result scaled by .5, reconstruction scaled by .25); a, b
 * and c are computed but not printed.
 *
 * Second loop: for a unit coefficient at each position 8..15, print 16
 * samples (offset 4..19) of the output of itdlbt8_lift_float, scaled by .5. */
void test8_lbt_float() {
    float __attribute__((aligned(16))) a[32], b[32], c[32], d[32], e[32];
    int i,j;
    printf("lbt8 float\n\n");
    for(i=4; i<20; i++) {
        memset(a,0,sizeof(a));
        a[i] = 1;
        memcpy(b,a,sizeof(a));
        memcpy(c,a,sizeof(a));
        memcpy(d,a,sizeof(a));
        memcpy(e,a,sizeof(a));
        dump_float(a,32,0,0);
        for(j=0; j<32; j+=8)
            fdct8_chen_float(a+j,1);
        flbt8_float(b,4);
        ftdlot8_float(c,4);
        ftdlbt8_float(d,4);
        ftdlbt8_lift_float(e,4);
        dump_float(d,32,0,.5);
        dump_float(e,32,0,.5);
        for(j=0; j<32; j+=8)
            idct8_chen_float(a+j,1);
        ilbt8_float(b,4);
        itdlot8_float(c,4);
        itdlbt8_float(d,4);
        itdlbt8_lift_float(e,4);
        dump_float(d,32,0,.25);
        dump_float(e,32,0,.25);
        printf("\n");
    }

    for(i=8; i<16; i++) {
        memset(a,0,sizeof(a));
        a[i] = 1;
        itdlbt8_lift_float(a,4);
        dump_float(a+4,16,0,.5);
    }
    printf("\n");
}

/* Exercise the integer lifting-based lapped transform on a 32-sample
 * (4-block) buffer.
 *
 * First loop: for an impulse of amplitude 1000 at each position 4..19, run
 * the plain per-block integer DCT (a) and ftdlbt8_lift_int (b) with their
 * inverses, printing the forward results scaled by 1/2 (mul = 1<<15 through
 * the >>16 in dump_int) and the reconstructions scaled by 1/4 (mul = 1<<14).
 *
 * Second loop: for a coefficient of 10000 at each position 8..15, print 16
 * samples (offset 4..19) of the output of itdlbt8_lift_int, scaled by 1/2. */
void test8_lbt_int() {
    int __attribute__((aligned(16))) a[32], b[32];
    int i,j;
    /* The banner used to read "lbt8 float", a copy-paste slip from
     * test8_lbt_float that made the two tests' output indistinguishable. */
    printf("lbt8 int\n\n");
    for(i=4; i<20; i++) {
        memset(a,0,sizeof(a));
        a[i] = 1000;
        memcpy(b,a,sizeof(a));
        dump_int(a,32,0,0);
        for(j=0; j<32; j+=8)
            fdct8_chen_int(a+j,1);
        ftdlbt8_lift_int(b,4);
        dump_int(a,32,0,1<<15);
        dump_int(b,32,0,1<<15);
        for(j=0; j<32; j+=8)
            idct8_chen_int(a+j,1);
        itdlbt8_lift_int(b,4);
        dump_int(a,32,0,1<<14);
        dump_int(b,32,0,1<<14);
        printf("\n");
    }

    for(i=8; i<16; i++) {
        memset(a,0,sizeof(a));
        a[i] = 10000;
        itdlbt8_lift_int(a,4);
        dump_int(a+4,16,0,1<<15);
    }
    printf("\n");
}

// Benchmark section, currently compiled out. When enabled it needs
// ../bench/timer.h for START_TIMER / STOP_TIMER2.
//   TIME(name)  calls name(a) once untimed, then 1<<22 more times with each
//               call bracketed by the timer macros.
//   BENCH(dctelem,simd) defines bench_<dctelem>(), which times the C and
//               SIMD 8x8 and 16x16 forward transforms on a zeroed, 16-byte
//               aligned 256-element buffer passed as void*.
// Note that BENCH(int,ssse3) would hand an int buffer to the *_int_ssse3
// routines, which the tests above call with int16_t data; harmless for
// timing a zeroed buffer, but worth knowing before reusing this code.
#if 0
#include "../bench/timer.h"
#define TIME(name)\
    name(a);\
    for(i=0; i<1<<22; i++) {\
        START_TIMER;\
        name(a);\
        STOP_TIMER2(#name, 1<<22);\
    }
#define BENCH(dctelem,simd) \
void bench_##dctelem() {\
    dctelem __attribute__((aligned(16))) b[256] = {0};\
    void *a = b;\
    int i;\
    TIME(fdct8x8_chen_##dctelem);\
    TIME(fdct8x8_chen_denorm_##dctelem);\
    TIME(fdct8x8_aan_##dctelem);\
    TIME(fdct8x8_chen_denorm_##dctelem##_##simd);\
    TIME(fdct8x8_aan_##dctelem##_##simd);\
    TIME(fdct16x16_llm_##dctelem);\
    TIME(fdct16x16_llm_##dctelem##_##simd);\
}
BENCH(float,sse2)
BENCH(int,ssse3)
#endif

/* Entry point of the test harness. Every test and benchmark call is
 * commented out, so as written the program does nothing and returns 0;
 * uncomment the line(s) for the tests to run. bench_int / bench_float
 * additionally require the `#if 0` benchmark section of this file to be
 * enabled, since they are not defined otherwise. */
int main(void) {
//  test4_float();
//  test4_int();
//  test8_float();
//  test8_int();
//  test16_float();
//  test16_int();
//  test4x4_float();
//  test8x8_float();
//  test8x8_int();
//  test16x16_float();
//  test16x16_int();
//  bench_int();
//  bench_float();
//  test8_lbt_float();
//  test8_lbt_int();
    return 0;
}
