gps/GPSResources/tcpmp/common/softidct/mcomp4x4_mmx.asm

576 lines
8.4 KiB
NASM
Raw Normal View History

2019-05-01 12:32:35 +00:00
;*****************************************************************************
;*
;* This program is free software ; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
;*
;* $Id: mcomp_mmx.asm 323 2005-11-01 20:52:32Z picard $
;*
;* The Core Pocket Media Player
;* Copyright (c) 2004-2005 Gabor Kovacs
;*
;*****************************************************************************
BITS 32
%if 0
SECTION .rdata
round7: dd 07070707h,07070707h
round8: dd 08080808h,08080808h
SECTION .text
%macro cglobal 2
%define %1 _%1@%2
global %1
%endmacro
cglobal AddBlock4x4_00,12
cglobal AddBlock4x4_01,12
cglobal AddBlock4x4_02,12
cglobal AddBlock4x4_03,12
cglobal AddBlock4x4_10,12
cglobal AddBlock4x4_11,12
cglobal AddBlock4x4_12,12
cglobal AddBlock4x4_13,12
cglobal AddBlock4x4_20,12
cglobal AddBlock4x4_21,12
cglobal AddBlock4x4_22,12
cglobal AddBlock4x4_23,12
cglobal AddBlock4x4_30,12
cglobal AddBlock4x4_31,12
cglobal AddBlock4x4_32,12
cglobal AddBlock4x4_33,12
cglobal CopyBlock4x4,16
cglobal CopyBlock4x4_01,16
cglobal CopyBlock4x4_02,16
cglobal CopyBlock4x4_03,16
cglobal CopyBlock4x4_10,16
cglobal CopyBlock4x4_11,16
cglobal CopyBlock4x4_12,16
cglobal CopyBlock4x4_13,16
cglobal CopyBlock4x4_20,16
cglobal CopyBlock4x4_21,16
cglobal CopyBlock4x4_22,16
cglobal CopyBlock4x4_23,16
cglobal CopyBlock4x4_20,16
cglobal CopyBlock4x4_21,16
cglobal CopyBlock4x4_22,16
cglobal CopyBlock4x4_23,16
cglobal CopyBlock4x4_01R,16
cglobal CopyBlock4x4_02R,16
cglobal CopyBlock4x4_03R,16
cglobal CopyBlock4x4_10R,16
cglobal CopyBlock4x4_11R,16
cglobal CopyBlock4x4_12R,16
cglobal CopyBlock4x4_13R,16
cglobal CopyBlock4x4_20R,16
cglobal CopyBlock4x4_21R,16
cglobal CopyBlock4x4_22R,16
cglobal CopyBlock4x4_23R,16
cglobal CopyBlock4x4_20R,16
cglobal CopyBlock4x4_21R,16
cglobal CopyBlock4x4_22R,16
cglobal CopyBlock4x4_23R,16
%macro loadparam 1
mov esi,[esp+12] ;src
mov edi,[esp+12+4] ;dst
mov eax,[esp+12+8] ;src pitch
%if %1>0
mov edx,8 ;dst pitch (fixed for AddBlock)
%else
mov edx,[esp+12+12] ;dst pitch
%endif
%endmacro
%macro loadmask1 0
mov ecx,0x01010101
movd mm6,ecx
pcmpeqb mm7,mm7
punpckldq mm6,mm6
pxor mm7,mm6
%endmacro
%macro loadmask4 0
mov ecx,0x03030303
movd mm6,ecx
pcmpeqb mm7,mm7
punpckldq mm6,mm6
pxor mm7,mm6
%endmacro
%macro loadmask16 0
mov ecx,0x0F0F0F0F
movd mm6,ecx
pcmpeqb mm7,mm7
punpckldq mm6,mm6
pxor mm7,mm6
%endmacro
%macro load1 2
movd mm0,[esi+%1]
%if %2>0
add esi,eax
%endif
movq mm1,mm0
pand mm1,mm7
psrlq mm1,1
%endmacro
%macro load2 2
movd mm2,[esi+%1]
%if %2>0
add esi,eax
%endif
movq mm3,mm2
pand mm3,mm7
psrlq mm3,1
%endmacro
%macro load1hv 1
movd mm0,[esi]
movd mm4,[esi+1]
add esi,eax
movq mm1,mm0
movq mm5,mm4
pand mm0,mm6
pand mm4,mm6
pand mm1,mm7
pand mm5,mm7
psrlq mm1,%0
psrlq mm5,%0
paddb mm0,mm4
paddb mm1,mm5
%endmacro
%macro load2hv 1
movd mm2,[esi]
movd mm4,[esi+1]
add esi,eax
movq mm3,mm2
movq mm5,mm4
pand mm2,mm6
pand mm4,mm6
pand mm3,mm7
pand mm5,mm7
psrlq mm3,%0
psrlq mm5,%0
paddb mm2,mm4
paddb mm3,mm5
%endmacro
%macro avg1 0
por mm0,mm2
pand mm0,mm6
paddb mm0,mm1
paddb mm0,mm3
%endmacro
%macro avg2 0
por mm2,mm0
pand mm2,mm6
paddb mm2,mm3
paddb mm2,mm1
%endmacro
%macro avground1 0
pand mm0,mm2
pand mm0,mm6
paddb mm0,mm1
paddb mm0,mm3
%endmacro
%macro avground2 0
pand mm2,mm0
pand mm2,mm6
paddb mm2,mm3
paddb mm2,mm1
%endmacro
%macro save1 0
movd [edi],mm0
add edi,edx
%endmacro
%macro save2 0
movd [edi],mm2
add edi,edx
%endmacro
%macro saveadd1 0
movd mm4,[edi]
movq mm1,mm0
pand mm0,mm7
por mm1,mm4
pand mm4,mm7
pand mm1,mm6
psrlq mm0,1
psrlq mm4,1
paddb mm1,mm0
paddb mm1,mm4
movd [edi],mm1
add edi,edx
%endmacro
%macro saveadd2 0
movd mm4,[edi]
movq mm3,mm2
pand mm2,mm7
por mm3,mm4
pand mm4,mm7
pand mm3,mm6
psrlq mm2,1
psrlq mm4,1
paddb mm3,mm2
paddb mm3,mm4
movd [edi],mm3
add edi,edx
%endmacro
%macro CopyBlock4x4_NN 4
ALIGN 16
CopyBlock4x4_%0%1%2:
push esi
push edi
loadparam 0
loadmask16
load1hv 4
%rep 2
load2hv 4
movq mm4,[%3]
paddb mm0,mm2
paddb mm1,mm3
paddb mm0,mm4 ;+7
pand mm0,mm7
psrlq mm0,2
paddb mm0,mm1
save1
load1hv 4
movq mm4,[%3]
paddb mm2,mm0
paddb mm3,mm1
paddb mm2,mm4 ;+7
pand mm2,mm7
psrlq mm2,2
paddb mm2,mm3
save2
%endrep
pop edi
pop esi
ret 16
%endmacro
ALIGN 16
CopyBlock4x4:
push esi
push edi
loadparam 0
push ebx
push ecx
mov ebx,[esi]
mov ecx,[esi+eax]
add esi,eax
mov [edi],ebx
mov [edi+edx],ecx
add edi,edx
mov ebx,[esi+eax]
mov ecx,[esi+eax*2]
mov [edi+edx],ebx
mov [edi+edx*2],ecx
pop ecx
pop ebx
pop edi
pop esi
ret 16
ALIGN 16
CopyBlock4x4_02:
push esi
push edi
loadparam 0
loadmask1
%rep 4
load1 0,0
load2 1,1
avg1
save1
%endrep
pop edi
pop esi
ret 16
ALIGN 16
CopyBlock4x4_20:
push esi
push edi
loadparam 0
loadmask1
load1 0,1
%rep 2
load2 0,1
avg1
save1
load1 0,1
avg2
save2
%endrep
pop edi
pop esi
ret 16
ALIGN 16
CopyBlock4x4_22:
push esi
push edi
loadparam 0
loadmask4
load1hv 2
%rep 2
load2hv 2
pcmpeqb mm4,mm4 ;-1
paddb mm0,mm2
paddb mm4,mm4 ;-2
paddb mm1,mm3
psubb mm0,mm4 ;+2
pand mm0,mm7
psrlq mm0,2
paddb mm0,mm1
save1
load1hv 2
pcmpeqb mm4,mm4 ;-1
paddb mm2,mm0
paddb mm4,mm4 ;-2
paddb mm3,mm1
psubb mm2,mm4 ;+2
pand mm2,mm7
psrlq mm2,2
paddb mm2,mm3
save2
%endrep
pop edi
pop esi
ret 16
ALIGN 16
CopyBlock4x4_02R:
push esi
push edi
loadparam 0
loadmask1
%rep 4
load1 0,0
load2 1,1
avground1
save1
%endrep
pop edi
pop esi
ret 16
ALIGN 16
CopyBlock4x4_20R:
push esi
push edi
loadparam 0
loadmask1
load1 0,1
%rep 2
load2 0,1
avground1
save1
load1 0,1
avground2
save2
%endrep
pop edi
pop esi
ret 16
ALIGN 16
CopyBlock4x4_22R:
push esi
push edi
loadparam 0
loadmask4
load1hv 2
%rep 2
load2hv 2
pcmpeqb mm4,mm4 ;-1
paddb mm0,mm2
paddb mm1,mm3
psubb mm0,mm4 ;+1
pand mm0,mm7
psrlq mm0,2
paddb mm0,mm1
save1
load1hv 2
pcmpeqb mm4,mm4 ;-1
paddb mm2,mm0
paddb mm3,mm1
psubb mm2,mm4 ;+1
pand mm2,mm7
psrlq mm2,2
paddb mm2,mm3
save2
%endrep
pop edi
pop esi
ret 16
ALIGN 16
AddBlock4x4:
push esi
push edi
loadparam 1
loadmask1
%rep 4
movd mm0,[esi]
add esi,eax
saveadd1
%endrep
pop edi
pop esi
ret 12
ALIGN 16
AddBlock4x4_02:
push esi
push edi
loadparam 1
loadmask1
%rep 4
load1 0,0
load2 1,1
avg1
saveadd1
%endrep
pop edi
pop esi
ret 12
ALIGN 16
AddBlock4x4_20:
push esi
push edi
loadparam 1
loadmask1
load1 0,1
%rep 2
load2 0,1
avg1
saveadd1
load1 0,1
avg2
saveadd2
%endrep
pop edi
pop esi
ret 12
ALIGN 16
AddBlock4x4_22:
push esi
push edi
loadparam 1
loadmask4
load1hv 2
%rep 2
load2hv 2
pcmpeqb mm5,mm5 ;-1
paddb mm0,mm2
paddb mm5,mm5 ;-2
paddb mm1,mm3
psubb mm0,mm5 ;+=2
pand mm0,mm7
psrlq mm0,2
paddb mm0,mm1
paddb mm6,mm5 ;0x03-2=0x01
psubb mm7,mm5 ;0xFD+2=0xFF
saveadd1
psubb mm6,mm5 ;restore mask
paddb mm7,mm5 ;restore mask
load1hv 2
pcmpeqb mm5,mm5 ;-1
paddb mm2,mm0
paddb mm5,mm5 ;-2
paddb mm3,mm1
psubb mm2,mm5 ;+=2
pand mm2,mm7
psrlq mm2,2
paddb mm2,mm3
paddb mm6,mm5 ;0x03-2=0x01
psubb mm7,mm5 ;0xFD+2=0xFF
saveadd2
psubb mm6,mm5 ;restore mask
paddb mm7,mm5 ;restore mask
%endrep
pop edi
pop esi
ret 12
%endif