; Copyright (C) 2007 Loren Merritt
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
; 
; 1. Redistributions of source code must retain the above copyright notice,
; this list of conditions and the following disclaimer.
; 
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
; 
; 3. The name of the author may not be used to endorse or promote products
; derived from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
; MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
; EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
; OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "amd64inc.asm"
%include "dcts_constants.asm"

section .text align=16

; Build-time selection of the arithmetic backend.
; With USE_FLOAT defined, the generic ADD/SUB/MOV/MUL operations map onto
; packed single-precision SSE instructions; otherwise they map onto packed
; 16-bit integer instructions with saturating add/subtract (paddsw/psubsw).
; cpusuf is the suffix that cglobal_suf appends to every exported symbol.
; GLOBAL is not defined in this file (presumably a symbol-addressing helper
; from amd64inc.asm -- confirm there).
%ifdef USE_FLOAT
    %define cpusuf float_sse2
    %define ADD addps
    %define SUB subps
    %define MOV movaps
    ; MUL reg, const: multiply reg in place by the memory operand named const.
    %macro MUL 2
        mulps %1, [%2 GLOBAL]
    %endmacro
%else
    %define cpusuf int_ssse3
    %define ADD paddsw
    %define SUB psubsw
    %define MOV movdqa
    ; MUL reg, const: fixed-point multiply of reg by the memory operand const.
    ; If the companion symbol <const>_shift is positive, reg is first shifted
    ; left by that many bits, then multiplied with pmulhrsw.
    ; The <const>_shift symbols are not defined in this file (presumably they
    ; come from dcts_constants.asm -- confirm there).
    %macro MUL 2
        %if %2_shift > 0
            psllw %1, %2_shift
        %endif
        pmulhrsw %1, [%2 GLOBAL]
    %endmacro
%endif

; cglobal_suf name
; Declare an exported, 16-byte-aligned entry point whose real symbol is
; name_<cpusuf>. The bare name is redefined to the suffixed one, so later
; uses of the bare name in this file (e.g. as a call target) resolve to it.
%macro cglobal_suf 1
    %define %1 %1_ %+ cpusuf
    global %1
    ALIGN 16
    %1:
%endmacro
; Route plain cglobal through the suffixing variant above.
%define cglobal cglobal_suf

; BUTTERFLY a, b [, tmp]
; Sum/difference pair on two registers (given by number):
;   out: a = a+b, b = a-b   (using the ADD/SUB selected for this build)
; The two-argument form needs no scratch register. The three-argument form
; uses tmp as scratch; after its SWAP, tmp is the name left with the old b.
; Both forms rely on SWAP (defined outside this file) renaming registers.
%macro BUTTERFLY 2-3
%if %0==3
    ; scratch-register form:
    ; faster on core2, same speed on k8, slower on p4
    MOV m%3, m%1
    ADD m%1, m%2
    SUB m%3, m%2
    SWAP %2, %3
%else
    ; in-place form: b += a, a = a+a-b, then exchange the two names
    ADD m%2, m%1
    ADD m%1, m%1
    SUB m%1, m%2
    SWAP %1, %2
%endif
%endmacro

; SBUTTERFLY size, a, b, tmp
; Interleave the elements of registers a and b at the granularity selected by
; size, which is pasted onto punpckl/punpckh (wd, dq or qdq in this file).
; out: a = punpckl<size>(a, b), b = punpckh<size>(a, b)
;      tmp is clobbered; after the SWAP it is the name left with the old b
;      (assuming SWAP, defined outside this file, exchanges register names).
%macro SBUTTERFLY 4
    movaps    m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
    SWAP %3, %4
%endmacro

; TRANSPOSE4x4D r0, r1, r2, r3, tmp
; Two rounds of SBUTTERFLY over the four registers (dword interleave, then
; qword interleave), followed by a name swap of r1 and r2.
; Per the macro name this transposes a 4x4 block of 32-bit elements held one
; row per register; tmp is a scratch register.
%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro

; TRANSPOSE8x8W r0, ..., r7, tmp
; Three rounds of SBUTTERFLY over the eight registers (word interleave, then
; dword, then qword), followed by name swaps of (r1, r4) and (r3, r6).
; Per the macro name this transposes an 8x8 block of 16-bit elements held one
; row per register; tmp is a scratch register.
%macro TRANSPOSE8x8W 9
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%endmacro

; in: x,y,a,b,tmp
; out: x=a*x+b*y, y=b*x-a*y
; x, y and tmp are register numbers; a and b are constant names handed to MUL.
; When a and b are the same token the rotation reduces to a butterfly scaled
; by that constant: x=b*(x+y), y=b*(x-y), and tmp is not used.
; Otherwise three multiplies are used instead of four:
;   t = b*(x+y);  x = (a-b)*x + t;  y = t - (a+b)*y
; This relies on constants named <a>m<b> and <a>p<b>; from their use here they
; must hold a-b and a+b (presumably defined in dcts_constants.asm -- confirm).
; In the integer build the results are subject to MUL rounding and to the
; saturation of ADD/SUB.
%macro ROTATE 5
%ifidn %3, %4
    BUTTERFLY %1, %2
    MUL  m%1, %4
    MUL  m%2, %4
%else
    movaps m%5, m%1
    ADD  m%5, m%2
    MUL  m%5, %4
    MUL  m%1, %3m%4
    MUL  m%2, %3p%4
    ADD  m%1, m%5
    SUB  m%5, m%2
    SWAP  %5, %2
%endif
%endmacro

; FDCT4_CHEN: 4-point forward transform on m0..m3, with m5 passed to ROTATE
; as scratch.
; Stage 1 forms sums/differences of the mirrored pairs (0,3) and (1,2).
; Stage 2 scales the sum pair's butterfly by c8 and rotates the difference
; pair by (c4, c12).
; The closing SWAP renames registers 1..3 (presumably into output order --
; confirm against the SWAP definition, which is outside this file).
%macro FDCT4_CHEN 0
    BUTTERFLY 0, 3
    BUTTERFLY 1, 2
    ROTATE    0, 1, c8, c8, 5
    ROTATE    3, 2, c4, c12, 5
    SWAP 3, 2, 1
%endmacro

; IDCT4_CHEN: 4-point inverse transform on m0..m3, with m5 passed to ROTATE
; as scratch. It applies the steps of FDCT4_CHEN in reverse order, starting
; from the opposite register renaming.
; NOTE(review): the overall scale of a forward/inverse round trip depends on
; the constant values, which are not visible in this file.
%macro IDCT4_CHEN 0
    SWAP 1, 2, 3
    ROTATE    3, 2, c4, c12, 5
    ROTATE    0, 1, c8, c8, 5
    BUTTERFLY 1, 2
    BUTTERFLY 0, 3
%endmacro

; FDCT4_CHEN_DENORM: variant of FDCT4_CHEN on m0..m3 (m5 scratch) in which the
; sum pair gets a plain BUTTERFLY instead of a c8-scaled one, and the
; difference pair is rotated by (c4r2, c12r2).
; Per the _DENORM suffix the outputs are presumably not uniformly normalized;
; confirm the intended scaling against the constant definitions.
%macro FDCT4_CHEN_DENORM 0
    BUTTERFLY 0, 3
    BUTTERFLY 1, 2
    BUTTERFLY 0, 1
    ROTATE    3, 2, c4r2, c12r2, 5
    SWAP 3, 2, 1
%endmacro

; IDCT4_CHEN_DENORM: counterpart of FDCT4_CHEN_DENORM on m0..m3 (m5 scratch).
; It applies the same steps in reverse order, starting from the opposite
; register renaming.
%macro IDCT4_CHEN_DENORM 0
    SWAP 1, 2, 3
    ROTATE    3, 2, c4r2, c12r2, 5
    BUTTERFLY 0, 1
    BUTTERFLY 1, 2
    BUTTERFLY 0, 3
%endmacro

; FDCT8_CHEN_DENORM r1, ..., r8, tmp
; 8-point forward transform on eight registers given by number, in place;
; tmp is a ninth register used as scratch by BUTTERFLY and ROTATE.
; The first four butterflies pair mirrored inputs (1,8) (2,7) (3,6) (4,5).
; The sums are then reduced by further butterflies, the two SWAP lines rename
; the intermediate registers, and the remaining steps rotate them by the
; c*r2 constant pairs.
; All statements depend on the register renaming performed by the earlier
; ones, so their order must not be changed.
%macro FDCT8_CHEN_DENORM 9
    BUTTERFLY %1, %8, %9
    BUTTERFLY %2, %7, %9
    BUTTERFLY %3, %6, %9
    BUTTERFLY %4, %5, %9
    BUTTERFLY %1, %4, %9
    BUTTERFLY %2, %3, %9
    BUTTERFLY %1, %2, %9
    SWAP %8, %5, %2
    SWAP %3, %4, %6, %7
    ROTATE %6, %4, c8, c8, %9
    BUTTERFLY %8, %4, %9
    BUTTERFLY %2, %6, %9
    ROTATE %3, %7, c4r2, c12r2, %9
    ROTATE %2, %8, c2r2, c14r2, %9
    ROTATE %6, %4, c10r2, c6r2, %9
%endmacro

; IDCT8_CHEN_DENORM r1, ..., r8, tmp
; Counterpart of FDCT8_CHEN_DENORM: the same rotations, butterflies and
; register renamings applied in reverse order on the eight registers, with
; tmp as scratch.
; Statement order is significant because each step relies on the renaming
; done by the preceding ones.
%macro IDCT8_CHEN_DENORM 9
    ROTATE %6, %4, c10r2, c6r2, %9
    ROTATE %2, %8, c2r2, c14r2, %9
    ROTATE %3, %7, c4r2, c12r2, %9
    BUTTERFLY %2, %6, %9
    BUTTERFLY %8, %4, %9
    ROTATE %6, %4, c8, c8, %9
    SWAP %7, %6, %4, %3
    SWAP %2, %5, %8
    BUTTERFLY %1, %2, %9
    BUTTERFLY %2, %3, %9
    BUTTERFLY %1, %4, %9
    BUTTERFLY %4, %5, %9
    BUTTERFLY %3, %6, %9
    BUTTERFLY %2, %7, %9
    BUTTERFLY %1, %8, %9
%endmacro

; FDCT8_AAN r1, ..., r8, tmp
; 8-point forward transform on eight registers given by number, in place;
; tmp is a ninth register used as scratch.
; After the initial butterfly stages it uses only two multiplies by c8 and one
; ROTATE by (c12, c4); the remaining work is additions and butterflies.
; The two closing SWAP lines rename the result registers (presumably into
; output order -- confirm against the SWAP definition outside this file).
; NOTE(review): per the AAN name the outputs are presumably left with
; per-coefficient scale factors to be applied elsewhere; this file does not
; show that step.
%macro FDCT8_AAN 9
    BUTTERFLY %1, %8, %9
    BUTTERFLY %2, %7, %9
    BUTTERFLY %3, %6, %9
    BUTTERFLY %4, %5, %9
    BUTTERFLY %1, %4, %9
    BUTTERFLY %2, %3, %9
    BUTTERFLY %1, %2, %9
    ADD m%3, m%4
    MUL m%3, c8
    BUTTERFLY %4, %3, %9
    ADD m%5, m%6
    ADD m%6, m%7
    ADD m%7, m%8
    ROTATE %5, %7, c12, c4, %9
    MUL m%6, c8
    BUTTERFLY %8, %6, %9
    BUTTERFLY %8, %5, %9
    BUTTERFLY %6, %7, %9
    SWAP %2, %8, %5
    SWAP %3, %4, %7
%endmacro

; IDCT8_AAN r1, ..., r8, tmp
; Counterpart of FDCT8_AAN on eight registers given by number, with tmp as
; scratch. It walks the forward steps backwards: the renamings are undone
; first, each forward ADD chain is undone by the matching SUB chain, and the
; forward multiplies by c8 are paired with multiplies by r2 here. The ROTATE
; uses the constants (c12t2, c4t2).
; Statement order is significant because of the register renaming.
%macro IDCT8_AAN 9
    SWAP %7, %4, %3
    SWAP %5, %8, %2
    BUTTERFLY %6, %7, %9
    BUTTERFLY %8, %5, %9
    BUTTERFLY %8, %6, %9
    MUL m%6, r2
    ROTATE %5, %7, c12t2, c4t2, %9
    SUB m%7, m%8
    SUB m%6, m%7
    SUB m%5, m%6
    BUTTERFLY %4, %3, %9
    MUL m%3, r2
    SUB m%3, m%4
    BUTTERFLY %1, %2, %9
    BUTTERFLY %2, %3, %9
    BUTTERFLY %1, %4, %9
    BUTTERFLY %4, %5, %9
    BUTTERFLY %3, %6, %9
    BUTTERFLY %2, %7, %9
    BUTTERFLY %1, %8, %9
%endmacro

; FDCT16_LLM spill
; 16-point forward transform on m0..m15, in place.
; spill is a memory operand: once the butterfly stages are done, m0 is written
; there and m0 is then reused as the scratch register of every ROTATE. m0 is
; never reloaded, so on exit the value that was in m0 lives only at spill,
; while m1..m15 hold the remaining values.
; The closing PERMUTE (defined outside this file) renames m1..m15, presumably
; into output order.
; Statement order is significant because BUTTERFLY/ROTATE rename registers.
%macro FDCT16_LLM 1
    BUTTERFLY 0, 15
    BUTTERFLY 1, 14
    BUTTERFLY 2, 13
    BUTTERFLY 3, 12
    BUTTERFLY 4, 11
    BUTTERFLY 5, 10
    BUTTERFLY 6, 9
    BUTTERFLY 7, 8
    BUTTERFLY 0, 7
    BUTTERFLY 1, 6
    BUTTERFLY 2, 5
    BUTTERFLY 3, 4
    BUTTERFLY 0, 3
    BUTTERFLY 1, 2
    BUTTERFLY 0, 1
    ; m0 is final here: park it in memory and free the register as scratch
    MOV %1, m0
    ROTATE 3, 2, c4r2, c12r2, 0
    ROTATE 7, 4, c2, c14, 0
    ROTATE 6, 5, c6, c10, 0
    BUTTERFLY 7, 6
    BUTTERFLY 4, 5
    BUTTERFLY 6, 4
    MUL m5, r2
    MUL m7, r2
    ROTATE 15, 8, c9, c7, 0
    ROTATE 9, 14, c5, c11, 0
    ROTATE 13, 10, c13, c3, 0
    ROTATE 11, 12, c1, c15, 0
    BUTTERFLY 15, 12
    BUTTERFLY 14, 13
    BUTTERFLY 11, 8
    BUTTERFLY 10, 9
    BUTTERFLY 15, 9
    BUTTERFLY 14, 8
    ROTATE 15, 14, c4r2, c12r2, 0
    ROTATE 8, 9, c12r2, c4r2, 0
    BUTTERFLY 13, 12
    BUTTERFLY 11, 10
    BUTTERFLY 11, 12
    MUL m13, r2
    MUL m10, r2
    PERMUTE 1,12, 2,7, 3,14, 4,3, 5,8, 7,10, 8,1, 9,13, 10,4, 11,9, 12,2, 13,15, 14,5, 15,11
%endmacro

; IDCT16_LLM spill
; 16-point inverse transform; counterpart of FDCT16_LLM with the same steps
; in reverse order.
; On entry m1..m15 hold fifteen of the inputs and the sixteenth is in memory
; at spill. m0 serves as the scratch register of every ROTATE and is loaded
; from spill only afterwards, just before the final butterfly stages.
; On exit all sixteen results are in m0..m15.
; The opening PERMUTE (defined outside this file) lists the pairs of the
; forward macro's PERMUTE with each pair reversed.
%macro IDCT16_LLM 1
    PERMUTE 12,1, 7,2, 14,3, 3,4, 8,5, 10,7, 1,8, 13,9, 4,10, 9,11, 2,12, 15,13, 5,14, 11,15
    MUL m10, r2
    MUL m13, r2
    BUTTERFLY 11, 12
    BUTTERFLY 11, 10
    BUTTERFLY 13, 12
    ROTATE 8, 9, c12r2, c4r2, 0
    ROTATE 15, 14, c4r2, c12r2, 0
    BUTTERFLY 14, 8
    BUTTERFLY 15, 9
    BUTTERFLY 10, 9
    BUTTERFLY 11, 8
    BUTTERFLY 14, 13
    BUTTERFLY 15, 12
    ROTATE 11, 12, c1, c15, 0
    ROTATE 13, 10, c13, c3, 0
    ROTATE 9, 14, c5, c11, 0
    ROTATE 15, 8, c9, c7, 0
    MUL m7, r2
    MUL m5, r2
    BUTTERFLY 6, 4
    BUTTERFLY 4, 5
    BUTTERFLY 7, 6
    ROTATE 6, 5, c6, c10, 0
    ROTATE 7, 4, c2, c14, 0
    ROTATE 3, 2, c4r2, c12r2, 0
    ; scratch use of m0 is over: bring in the input that was kept in memory
    MOV m0, %1
    BUTTERFLY 0, 1
    BUTTERFLY 1, 2
    BUTTERFLY 0, 3
    BUTTERFLY 3, 4
    BUTTERFLY 2, 5
    BUTTERFLY 1, 6
    BUTTERFLY 0, 7
    BUTTERFLY 7, 8
    BUTTERFLY 6, 9
    BUTTERFLY 5, 10
    BUTTERFLY 4, 11
    BUTTERFLY 3, 12
    BUTTERFLY 2, 13
    BUTTERFLY 1, 14
    BUTTERFLY 0, 15
%endmacro

; LOADS base, stride, r0, r1, ...
; Load consecutive rows into the listed registers: the k-th listed register
; (counting from 0) receives [base + k*stride] via MOV.
; base is an address expression, stride a byte count, the rest register
; numbers.
%macro LOADS 3-*
    %xdefine %%base %1
    %assign %%step %2
    %assign %%offset 0
    ; drop base and stride so the current register number is argument 1
    %rotate 2
    %rep %0-2
        MOV m%1, [%%base+%%offset]
        %assign %%offset %%offset+%%step
        %rotate 1
    %endrep
%endmacro

; STORES base, stride, r0, r1, ...
; Store the listed registers to consecutive rows: the k-th listed register
; (counting from 0) is written to [base + k*stride] via MOV.
; base is an address expression, stride a byte count, the rest register
; numbers.
%macro STORES 3-*
    %xdefine %%base %1
    %assign %%step %2
    %assign %%offset 0
    ; drop base and stride so the current register number is argument 1
    %rotate 2
    %rep %0-2
        MOV [%%base+%%offset], m%1
        %assign %%offset %%offset+%%step
        %rotate 1
    %endrep
%endmacro

; 4x4 transforms exist only in the float build.
%ifdef USE_FLOAT

; DCT4x4 name, transform
; Emit an exported function name_<cpusuf> that processes a 4x4 block in place
; at parm1q (row stride 16 bytes): load four rows, run the 1-D transform
; macro, transpose, run it again, store the four rows back.
; transform must be a zero-argument macro working on m0..m3; m4 is used as
; transpose scratch (the transforms passed below use m5 as their own scratch).
; MOV is movaps here, so the block must be 16-byte aligned.
%macro DCT4x4 2
cglobal %1
    LOADS parm1q, 16, 0, 1, 2, 3
    %2
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    %2
    STORES parm1q, 16, 0, 1, 2, 3
    ret
%endmacro

DCT4x4 fdct4x4_chen, FDCT4_CHEN
DCT4x4 idct4x4_chen, IDCT4_CHEN
DCT4x4 fdct4x4_chen_denorm, FDCT4_CHEN_DENORM
DCT4x4 idct4x4_chen_denorm, IDCT4_CHEN_DENORM

%endif ;USE_FLOAT

; DCT8x8 name, transform
; Emit an exported function name_<cpusuf> that processes an 8x8 block in place
; at parm1q. transform is a 1-D macro taking eight register numbers plus one
; scratch register number. Two variants follow, one per build.
; NOTE(review): no transpose follows the second pass in either variant, so the
; stored block appears to be in transposed orientation relative to the input;
; confirm that callers expect this.
%ifdef USE_FLOAT

; Float build: rows are 0x20 bytes apart and each row is covered by two
; 16-byte registers, so the block is handled as two 16-byte-wide strips and
; the transpose is assembled from four 4x4 transposes.
%macro DCT8x8 2
cglobal %1
    ; first pass on the strip at row offset 0x00 (m0..m7, scratch m8)
    LOADS parm1q, 0x20, 0, 1, 2, 3, 4, 5, 6, 7
    %2 0, 1, 2, 3, 4, 5, 6, 7, 8
    TRANSPOSE4x4D 0, 1, 2, 3, 8
    TRANSPOSE4x4D 4, 5, 6, 7, 8
    ; park m0 in the block itself so m0 can be the scratch register below
    MOV [parm1q], m0
    ; first pass on the strip at row offset 0x10 (m8..m15, scratch m0)
    LOADS parm1q+0x10, 0x20, 8, 9, 10, 11, 12, 13, 14, 15
    %2 8, 9, 10, 11, 12, 13, 14, 15, 0
    TRANSPOSE4x4D 8, 9, 10, 11, 0
    TRANSPOSE4x4D 12, 13, 14, 15, 0
    ; second pass: after the 4x4 transposes the two off-diagonal quadrants
    ; trade places, so the strips are now m4..m7 + m12..m15 and
    ; m0..m3 + m8..m11
    %2 4, 5, 6, 7, 12, 13, 14, 15, 0
    STORES parm1q+0x10, 0x20, 4, 5, 6, 7, 12, 13, 14, 15
    ; bring back the parked m0; m4 has been stored and is free as scratch
    MOV m0, [parm1q]
    %2 0, 1, 2, 3, 8, 9, 10, 11, 4
    STORES parm1q, 0x20, 0, 1, 2, 3, 8, 9, 10, 11
    ret
%endmacro

%else

; Integer build: rows are 0x10 bytes apart, one register per row, so the
; whole block fits in m0..m7 with m8 as scratch.
%macro DCT8x8 2
cglobal %1
    LOADS parm1q, 0x10, 0, 1, 2, 3, 4, 5, 6, 7
    %2 0, 1, 2, 3, 4, 5, 6, 7, 8
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
    %2 0, 1, 2, 3, 4, 5, 6, 7, 8
    STORES parm1q, 0x10, 0, 1, 2, 3, 4, 5, 6, 7
    ret
%endmacro

%endif ;USE_FLOAT

DCT8x8 fdct8x8_chen_denorm, FDCT8_CHEN_DENORM
DCT8x8 idct8x8_chen_denorm, IDCT8_CHEN_DENORM
DCT8x8 fdct8x8_aan, FDCT8_AAN
DCT8x8 idct8x8_aan, IDCT8_AAN

; fdct16col_llm: 16-point forward transform of the values held in m0..m15.
; In:   m0..m15 = the sixteen inputs
;       parm2q  = address of a 16-byte slot accessed with MOV
; Out:  the value FDCT16_LLM parks from m0 is at [parm2q]; the other fifteen
;       are in m1..m15 under the register naming recorded by SAVE_MMPERM.
; RESET_MMPERM / SAVE_MMPERM are defined outside this file; presumably they
; reset the register renaming at entry and record the final renaming so that
; callers can pick it up after the call -- confirm in amd64inc.asm.
cglobal fdct16col_llm
    RESET_MMPERM
    FDCT16_LLM [parm2q]
    SAVE_MMPERM fdct16col_llm
    ret

; idct16col_llm: 16-point inverse transform.
; In:   m1..m15 = fifteen of the inputs
;       parm2q  = address of the sixteenth input, which IDCT16_LLM loads into
;                 m0 with MOV
; Out:  all sixteen results in m0..m15 under the register naming recorded by
;       SAVE_MMPERM.
; RESET_MMPERM / SAVE_MMPERM are defined outside this file; presumably they
; reset the register renaming at entry and record the final renaming so that
; callers can pick it up after the call -- confirm in amd64inc.asm.
cglobal idct16col_llm
    RESET_MMPERM
    IDCT16_LLM [parm2q]
    SAVE_MMPERM idct16col_llm
    ret

; Geometry of a 16x16 block for the current build, plus the transpose step
; used between the two passes of the 16x16 transforms.
;   STRIDE16 = byte distance between consecutive rows
;   ROWS16   = number of 16-byte-wide strips per row (STRIDE16 / 16)
; TRANSPOSE_COL16 dst
;   Transpose the strip held in m0..m15 and store it as a 0x100-byte block at
;   dst with row stride STRIDE16. m0 is used as transpose scratch first and is
;   then reloaded from [dst], so the caller must have placed the m0 value at
;   [dst] beforehand.
%ifdef USE_FLOAT
%define ROWS16 4
%define STRIDE16 0x40
%macro TRANSPOSE_COL16 1
    ; m4..m15 first, with m0 as scratch (m1..m3 stay live)
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    TRANSPOSE4x4D 8, 9, 10, 11, 0
    TRANSPOSE4x4D 12, 13, 14, 15, 0
    STORES %1+0x10, STRIDE16, 4, 5, 6, 7
    STORES %1+0x20, STRIDE16, 8, 9, 10, 11
    STORES %1+0x30, STRIDE16, 12, 13, 14, 15
    ; recover the real m0; m4 has been stored and is free as scratch
    MOV m0, [%1]
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    STORES %1+0x00, STRIDE16, 0, 1, 2, 3
%endmacro
%else
%define ROWS16 2
%define STRIDE16 0x20
%macro TRANSPOSE_COL16 1
    ; m8..m15 first, with m0 as scratch (m1..m7 stay live)
    TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
    STORES %1+0x10, STRIDE16, 8, 9, 10, 11, 12, 13, 14, 15
    ; recover the real m0; m8 has been stored and is free as scratch
    MOV m0, [%1]
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
    STORES %1+0x00, STRIDE16, 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
%endif ;USE_FLOAT

; fdct16x16_llm: two-pass 16x16 forward transform, in place at parm1q.
; In:    parm1q = block base (row stride STRIDE16); accessed with MOV, so it
;                 must be 16-byte aligned
; Stack: 8 + 0x100*ROWS16 bytes, released before returning
; Clobb: rax, parm2q, m0..m15, flags
; Pass 1 (.loopv) transforms each 16-byte-wide strip of the block and writes
; it, transposed, to a fresh 0x100-byte stack block. Strips are taken from
; last to first while rsp moves down, so the stack blocks end up in ascending
; strip order starting at rsp.
; Pass 2 (.looph) transforms each strip of that stack image and stores the
; result back into the block.
; NOTE(review): no transpose follows pass 2, so the output appears to be in
; transposed orientation relative to the input; confirm callers expect this.
cglobal fdct16x16_llm
    ; presumably restores 16-byte stack alignment after the return-address
    ; push, which the MOV accesses through rsp below need
    sub rsp, 8
    ; rax = byte offset of the current strip within a row, last strip first
    mov eax, 16*(ROWS16-1)
.loopv:
    RESET_MMPERM
    LOADS parm1q+rax, STRIDE16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    sub rsp, 0x100
    ; the column routine parks m0 at [rsp], where TRANSPOSE_COL16 expects it
    mov parm2q, rsp
    call fdct16col_llm
    TRANSPOSE_COL16 rsp
    sub eax, 16
    jge .loopv
    mov eax, 16*(ROWS16-1)
.looph:
    RESET_MMPERM
    LOADS rsp+rax, STRIDE16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    ; row 0 of the result is written by the column routine itself
    lea parm2q, [parm1q+rax]
    call fdct16col_llm
    ; rows 1..15 come back in registers
    STORES parm1q+rax+STRIDE16, STRIDE16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    sub eax, 16
    jge .looph
    add rsp, 0x100*ROWS16+8
    ret

; idct16x16_llm: two-pass 16x16 inverse transform, in place at parm1q.
; In:    parm1q = block base (row stride STRIDE16); accessed with MOV, so it
;                 must be 16-byte aligned
; Stack: 8 + 0x100*ROWS16 bytes, released before returning
; Clobb: rax, parm2q, m0..m15, flags
; Pass 1 (.loopv) transforms each 16-byte-wide strip of the block and writes
; it, transposed, to a fresh 0x100-byte stack block. Strips are taken from
; last to first while rsp moves down, so the stack blocks end up in ascending
; strip order starting at rsp.
; Pass 2 (.looph) transforms each strip of that stack image and stores all
; sixteen rows back into the block.
cglobal idct16x16_llm
    ; presumably restores 16-byte stack alignment after the return-address
    ; push, which the MOV accesses through rsp below need
    sub rsp, 8
    ; rax = byte offset of the current strip within a row, last strip first
    mov eax, 16*(ROWS16-1)
.loopv:
    RESET_MMPERM
    ; row 0 is read from [parm2q] by the column routine; rows 1..15 go in
    ; registers
    lea parm2q, [parm1q+rax]
    LOADS parm2q+STRIDE16, STRIDE16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    call idct16col_llm
    sub rsp, 0x100
    ; TRANSPOSE_COL16 uses m0 as scratch and reloads it from [rsp]
    MOV [rsp], m0
    TRANSPOSE_COL16 rsp
    sub eax, 16
    jge .loopv
    mov eax, 16*(ROWS16-1)
.looph:
    RESET_MMPERM
    ; same input convention as above, now reading the stack image
    lea parm2q, [rsp+rax]
    LOADS parm2q+STRIDE16, STRIDE16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    call idct16col_llm
    STORES parm1q+rax, STRIDE16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    sub eax, 16
    jge .looph
    add rsp, 0x100*ROWS16+8
    ret

