;*****************************************************************************
;*
;* This program is free software ; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
;*
;* $Id: mcomp_mmx.asm 323 2005-11-01 20:52:32Z picard $
;*
;* The Core Pocket Media Player
;* Copyright (c) 2004-2005 Gabor Kovacs
;*
;*****************************************************************************

;-----------------------------------------------------------------------------
; Syntax: NASM, 32-bit x86, MMX.
;
; 4x4 block copy / add routines with optional half-sample averaging.
; Every routine:
;   - pushes esi and edi, so the first stack argument is at [esp+12]
;   - reads its arguments as: src, dst, src pitch, (dst pitch)
;   - pops its own arguments on return (ret 16 for CopyBlock*, ret 12 for
;     AddBlock*), matching the _name@N decoration produced by cglobal
;   - clobbers eax, edx, and (except plain CopyBlock4x4) ecx and mm0-mm7
;   - does NOT execute emms
;     NOTE(review): presumably the caller is responsible for emms - confirm.
;
; All per-byte arithmetic is done on packed bytes by splitting each byte
; into "low bits" (kept in place) and "high bits" (pre-shifted right), so
; that sums never carry across byte lanes.
;
; The whole file is currently disabled by the %if 0 guard below.
;-----------------------------------------------------------------------------

BITS 32

%if 0

SECTION .rdata

round7: dd 07070707h,07070707h      ; 8 packed bytes of 7
round8: dd 08080808h,08080808h      ; 8 packed bytes of 8

SECTION .text

;-----------------------------------------------------------------------------
; cglobal name, argbytes
;   Redefines <name> as _<name>@<argbytes> and declares it global.
;-----------------------------------------------------------------------------
%macro cglobal 2
%define %1 _%1@%2
global %1
%endmacro

cglobal AddBlock4x4_00,12
cglobal AddBlock4x4_01,12
cglobal AddBlock4x4_02,12
cglobal AddBlock4x4_03,12
cglobal AddBlock4x4_10,12
cglobal AddBlock4x4_11,12
cglobal AddBlock4x4_12,12
cglobal AddBlock4x4_13,12
cglobal AddBlock4x4_20,12
cglobal AddBlock4x4_21,12
cglobal AddBlock4x4_22,12
cglobal AddBlock4x4_23,12
cglobal AddBlock4x4_30,12
cglobal AddBlock4x4_31,12
cglobal AddBlock4x4_32,12
cglobal AddBlock4x4_33,12

cglobal CopyBlock4x4,16
cglobal CopyBlock4x4_01,16
cglobal CopyBlock4x4_02,16
cglobal CopyBlock4x4_03,16
cglobal CopyBlock4x4_10,16
cglobal CopyBlock4x4_11,16
cglobal CopyBlock4x4_12,16
cglobal CopyBlock4x4_13,16
cglobal CopyBlock4x4_20,16
cglobal CopyBlock4x4_21,16
cglobal CopyBlock4x4_22,16
cglobal CopyBlock4x4_23,16
cglobal CopyBlock4x4_30,16
cglobal CopyBlock4x4_31,16
cglobal CopyBlock4x4_32,16
cglobal CopyBlock4x4_33,16

cglobal CopyBlock4x4_01R,16
cglobal CopyBlock4x4_02R,16
cglobal CopyBlock4x4_03R,16
cglobal CopyBlock4x4_10R,16
cglobal CopyBlock4x4_11R,16
cglobal CopyBlock4x4_12R,16
cglobal CopyBlock4x4_13R,16
cglobal CopyBlock4x4_20R,16
cglobal CopyBlock4x4_21R,16
cglobal CopyBlock4x4_22R,16
cglobal CopyBlock4x4_23R,16
cglobal CopyBlock4x4_30R,16
cglobal CopyBlock4x4_31R,16
cglobal CopyBlock4x4_32R,16
cglobal CopyBlock4x4_33R,16

; NOTE(review): only the plain, _02, _20 and _22 variants (and the R forms of
; the Copy ones) are implemented in this file; the remaining names declared
; above have no label here.

;-----------------------------------------------------------------------------
; loadparam fixed_dst_pitch
;   Loads the stack arguments. Must be used right after "push esi / push edi".
;   Out: esi = src, edi = dst, eax = src pitch,
;        edx = dst pitch (constant 8 when the argument is > 0,
;              otherwise taken from the fourth stack argument)
;-----------------------------------------------------------------------------
%macro loadparam 1
    mov     esi,[esp+12]            ;src
    mov     edi,[esp+12+4]          ;dst
    mov     eax,[esp+12+8]          ;src pitch
%if %1>0
    mov     edx,8                   ;dst pitch (fixed for AddBlock)
%else
    mov     edx,[esp+12+12]         ;dst pitch
%endif
%endmacro

;-----------------------------------------------------------------------------
; loadmask1 / loadmask4 / loadmask16
;   Out: mm6 = low-bit mask replicated into all 8 bytes (0x01 / 0x03 / 0x0F)
;        mm7 = bitwise complement of mm6          (0xFE / 0xFC / 0xF0)
;   Clobbers: ecx
;-----------------------------------------------------------------------------
%macro loadmask1 0
    mov     ecx,0x01010101
    movd    mm6,ecx
    pcmpeqb mm7,mm7                 ; mm7 = all ones
    punpckldq mm6,mm6               ; copy low dword into high dword
    pxor    mm7,mm6                 ; mm7 = ~mm6
%endmacro

%macro loadmask4 0
    mov     ecx,0x03030303
    movd    mm6,ecx
    pcmpeqb mm7,mm7                 ; mm7 = all ones
    punpckldq mm6,mm6               ; copy low dword into high dword
    pxor    mm7,mm6                 ; mm7 = ~mm6
%endmacro

%macro loadmask16 0
    mov     ecx,0x0F0F0F0F
    movd    mm6,ecx
    pcmpeqb mm7,mm7                 ; mm7 = all ones
    punpckldq mm6,mm6               ; copy low dword into high dword
    pxor    mm7,mm6                 ; mm7 = ~mm6
%endmacro

;-----------------------------------------------------------------------------
; load1 offset, advance      (load2 is identical but targets mm2/mm3)
;   Loads 4 bytes from [esi+offset]; advances esi by the src pitch when
;   advance > 0.
;   Out: mm0 = the 4 source bytes, mm1 = (mm0 & mm7) >> 1
;   With loadmask1 active, mm1 holds each byte divided by two.
;-----------------------------------------------------------------------------
%macro load1 2
    movd    mm0,[esi+%1]
%if %2>0
    add     esi,eax
%endif
    movq    mm1,mm0
    pand    mm1,mm7                 ; drop low bit so the shift stays in-lane
    psrlq   mm1,1
%endmacro

%macro load2 2
    movd    mm2,[esi+%1]
%if %2>0
    add     esi,eax
%endif
    movq    mm3,mm2
    pand    mm3,mm7                 ; drop low bit so the shift stays in-lane
    psrlq   mm3,1
%endmacro

;-----------------------------------------------------------------------------
; load1hv shift              (load2hv is identical but targets mm2/mm3)
;   Loads 4 bytes at [esi] and 4 bytes at [esi+1], then advances esi by the
;   src pitch. For each byte lane, with a = [esi+i] and b = [esi+i+1]:
;   Out: mm0 = (a & mm6) + (b & mm6)                        (low-bit sum)
;        mm1 = ((a & mm7) >> shift) + ((b & mm7) >> shift)  (high-bit sum)
;   Clobbers: mm4, mm5
;   The shift count is the macro's first argument (%1). %0 would expand to
;   the argument count, not the argument value.
;-----------------------------------------------------------------------------
%macro load1hv 1
    movd    mm0,[esi]
    movd    mm4,[esi+1]
    add     esi,eax
    movq    mm1,mm0
    movq    mm5,mm4
    pand    mm0,mm6
    pand    mm4,mm6
    pand    mm1,mm7
    pand    mm5,mm7
    psrlq   mm1,%1
    psrlq   mm5,%1
    paddb   mm0,mm4
    paddb   mm1,mm5
%endmacro

%macro load2hv 1
    movd    mm2,[esi]
    movd    mm4,[esi+1]
    add     esi,eax
    movq    mm3,mm2
    movq    mm5,mm4
    pand    mm2,mm6
    pand    mm4,mm6
    pand    mm3,mm7
    pand    mm5,mm7
    psrlq   mm3,%1
    psrlq   mm5,%1
    paddb   mm2,mm4
    paddb   mm3,mm5
%endmacro

;-----------------------------------------------------------------------------
; avg1 / avg2
;   In:  mm0/mm1 and mm2/mm3 as produced by load1 / load2, loadmask1 active.
;   Out: avg1 -> mm0, avg2 -> mm2 = (a>>1) + (b>>1) + ((a|b)&1)
;        i.e. the per-byte average rounded up, (a+b+1)>>1.
;-----------------------------------------------------------------------------
%macro avg1 0
    por     mm0,mm2
    pand    mm0,mm6                 ; (a|b)&1 : rounding carry
    paddb   mm0,mm1
    paddb   mm0,mm3
%endmacro

%macro avg2 0
    por     mm2,mm0
    pand    mm2,mm6                 ; (a|b)&1 : rounding carry
    paddb   mm2,mm3
    paddb   mm2,mm1
%endmacro

;-----------------------------------------------------------------------------
; avground1 / avground2
;   Same as avg1 / avg2 but uses (a&b)&1 as the carry, giving the per-byte
;   average rounded down, (a+b)>>1.
;-----------------------------------------------------------------------------
%macro avground1 0
    pand    mm0,mm2
    pand    mm0,mm6                 ; (a&b)&1 : carry only when both are odd
    paddb   mm0,mm1
    paddb   mm0,mm3
%endmacro

%macro avground2 0
    pand    mm2,mm0
    pand    mm2,mm6                 ; (a&b)&1 : carry only when both are odd
    paddb   mm2,mm3
    paddb   mm2,mm1
%endmacro

;-----------------------------------------------------------------------------
; save1 / save2
;   Stores the low 4 bytes of mm0 (save1) or mm2 (save2) to [edi] and
;   advances edi by the dst pitch.
;-----------------------------------------------------------------------------
%macro save1 0
    movd    [edi],mm0
    add     edi,edx
%endmacro

%macro save2 0
    movd    [edi],mm2
    add     edi,edx
%endmacro

;-----------------------------------------------------------------------------
; saveadd1 / saveadd2
;   Averages mm0 (saveadd1) or mm2 (saveadd2) with the 4 bytes already at
;   [edi], rounding up, stores the result to [edi] and advances edi by the
;   dst pitch. Requires mm6 = 0x01.. and mm7 = 0xFE.. (loadmask1 state).
;   Clobbers: saveadd1 -> mm0, mm1, mm4;  saveadd2 -> mm2, mm3, mm4
;-----------------------------------------------------------------------------
%macro saveadd1 0
    movd    mm4,[edi]
    movq    mm1,mm0
    pand    mm0,mm7
    por     mm1,mm4
    pand    mm4,mm7
    pand    mm1,mm6                 ; (new|old)&1 : rounding carry
    psrlq   mm0,1
    psrlq   mm4,1
    paddb   mm1,mm0
    paddb   mm1,mm4
    movd    [edi],mm1
    add     edi,edx
%endmacro

%macro saveadd2 0
    movd    mm4,[edi]
    movq    mm3,mm2
    pand    mm2,mm7
    por     mm3,mm4
    pand    mm4,mm7
    pand    mm3,mm6                 ; (new|old)&1 : rounding carry
    psrlq   mm2,1
    psrlq   mm4,1
    paddb   mm3,mm2
    paddb   mm3,mm4
    movd    [edi],mm3
    add     edi,edx
%endmacro

;-----------------------------------------------------------------------------
; CopyBlock4x4_NN a, b, c, round
;   Emits a routine labelled CopyBlock4x4_<a><b><c> that adds the 8 packed
;   bytes at [round] as a bias. This macro is not invoked in this file.
;   NOTE(review): the meaning of a/b/c and which of round7/round8 is intended
;   cannot be determined from this file. The body masks with 0x0F/0xF0 and
;   pre-shifts by 4 but still sums only four samples and shifts the low part
;   by 2; it looks unfinished - verify before enabling.
;-----------------------------------------------------------------------------
%macro CopyBlock4x4_NN 4
ALIGN 16
CopyBlock4x4_%1%2%3:
    push    esi
    push    edi
    loadparam 0
    loadmask16

    load1hv 4
%rep 2
    load2hv 4
    movq    mm4,[%4]
    paddb   mm0,mm2
    paddb   mm1,mm3
    paddb   mm0,mm4                 ;+7
    pand    mm0,mm7
    psrlq   mm0,2
    paddb   mm0,mm1
    save1

    load1hv 4
    movq    mm4,[%4]
    paddb   mm2,mm0
    paddb   mm3,mm1
    paddb   mm2,mm4                 ;+7
    pand    mm2,mm7
    psrlq   mm2,2
    paddb   mm2,mm3
    save2
%endrep

    pop     edi
    pop     esi
    ret     16
%endmacro

;-----------------------------------------------------------------------------
; CopyBlock4x4(src, dst, src pitch, dst pitch)
;   Copies 4 rows of 4 bytes from src to dst using integer registers only.
;   Preserves ebx and ecx; clobbers eax, edx.
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4:
    push    esi
    push    edi
    loadparam 0
    push    ebx
    push    ecx

    mov     ebx,[esi]               ; row 0
    mov     ecx,[esi+eax]           ; row 1
    add     esi,eax                 ; esi -> row 1
    mov     [edi],ebx
    mov     [edi+edx],ecx
    add     edi,edx                 ; edi -> row 1
    mov     ebx,[esi+eax]           ; row 2
    mov     ecx,[esi+eax*2]         ; row 3
    mov     [edi+edx],ebx
    mov     [edi+edx*2],ecx

    pop     ecx
    pop     ebx
    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; CopyBlock4x4_02(src, dst, src pitch, dst pitch)
;   dst[y][x] = (src[y][x] + src[y][x+1] + 1) >> 1   for a 4x4 block.
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4_02:
    push    esi
    push    edi
    loadparam 0
    loadmask1

%rep 4
    load1   0,0                     ; bytes at x, stay on this row
    load2   1,1                     ; bytes at x+1, then step to next row
    avg1
    save1
%endrep

    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; CopyBlock4x4_20(src, dst, src pitch, dst pitch)
;   dst[y][x] = (src[y][x] + src[y+1][x] + 1) >> 1   for a 4x4 block.
;   Alternates between the mm0/mm1 and mm2/mm3 pairs so each source row is
;   loaded only once (5 rows read in total).
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4_20:
    push    esi
    push    edi
    loadparam 0
    loadmask1

    load1   0,1
%rep 2
    load2   0,1
    avg1
    save1

    load1   0,1
    avg2
    save2
%endrep

    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; CopyBlock4x4_22(src, dst, src pitch, dst pitch)
;   dst[y][x] = (src[y][x] + src[y][x+1] + src[y+1][x] + src[y+1][x+1] + 2) >> 2
;   for a 4x4 block. mm0/mm2 carry the sums of the low two bits, mm1/mm3 the
;   sums of the pre-shifted high six bits.
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4_22:
    push    esi
    push    edi
    loadparam 0
    loadmask4

    load1hv 2
%rep 2
    load2hv 2
    pcmpeqb mm4,mm4                 ;-1
    paddb   mm0,mm2
    paddb   mm4,mm4                 ;-2
    paddb   mm1,mm3
    psubb   mm0,mm4                 ;+2
    pand    mm0,mm7                 ; keep only the carry out of the low bits
    psrlq   mm0,2
    paddb   mm0,mm1
    save1

    load1hv 2
    pcmpeqb mm4,mm4                 ;-1
    paddb   mm2,mm0
    paddb   mm4,mm4                 ;-2
    paddb   mm3,mm1
    psubb   mm2,mm4                 ;+2
    pand    mm2,mm7                 ; keep only the carry out of the low bits
    psrlq   mm2,2
    paddb   mm2,mm3
    save2
%endrep

    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; CopyBlock4x4_02R(src, dst, src pitch, dst pitch)
;   dst[y][x] = (src[y][x] + src[y][x+1]) >> 1   for a 4x4 block.
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4_02R:
    push    esi
    push    edi
    loadparam 0
    loadmask1

%rep 4
    load1   0,0                     ; bytes at x, stay on this row
    load2   1,1                     ; bytes at x+1, then step to next row
    avground1
    save1
%endrep

    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; CopyBlock4x4_20R(src, dst, src pitch, dst pitch)
;   dst[y][x] = (src[y][x] + src[y+1][x]) >> 1   for a 4x4 block.
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4_20R:
    push    esi
    push    edi
    loadparam 0
    loadmask1

    load1   0,1
%rep 2
    load2   0,1
    avground1
    save1

    load1   0,1
    avground2
    save2
%endrep

    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; CopyBlock4x4_22R(src, dst, src pitch, dst pitch)
;   dst[y][x] = (src[y][x] + src[y][x+1] + src[y+1][x] + src[y+1][x+1] + 1) >> 2
;   for a 4x4 block (bias of 1 instead of 2).
;-----------------------------------------------------------------------------
ALIGN 16
CopyBlock4x4_22R:
    push    esi
    push    edi
    loadparam 0
    loadmask4

    load1hv 2
%rep 2
    load2hv 2
    pcmpeqb mm4,mm4                 ;-1
    paddb   mm0,mm2
    paddb   mm1,mm3
    psubb   mm0,mm4                 ;+1
    pand    mm0,mm7                 ; keep only the carry out of the low bits
    psrlq   mm0,2
    paddb   mm0,mm1
    save1

    load1hv 2
    pcmpeqb mm4,mm4                 ;-1
    paddb   mm2,mm0
    paddb   mm3,mm1
    psubb   mm2,mm4                 ;+1
    pand    mm2,mm7                 ; keep only the carry out of the low bits
    psrlq   mm2,2
    paddb   mm2,mm3
    save2
%endrep

    pop     edi
    pop     esi
    ret     16

;-----------------------------------------------------------------------------
; AddBlock4x4 / AddBlock4x4_00(src, dst, src pitch)
;   dst[y][x] = (dst[y][x] + src[y][x] + 1) >> 1   for a 4x4 block.
;   The dst pitch is fixed at 8 bytes (loadparam 1).
;   AddBlock4x4_00 is the name exported by the cglobal list; AddBlock4x4 is
;   kept as a second label on the same entry point.
;-----------------------------------------------------------------------------
ALIGN 16
AddBlock4x4_00:
AddBlock4x4:
    push    esi
    push    edi
    loadparam 1
    loadmask1

%rep 4
    movd    mm0,[esi]
    add     esi,eax
    saveadd1
%endrep

    pop     edi
    pop     esi
    ret     12

;-----------------------------------------------------------------------------
; AddBlock4x4_02(src, dst, src pitch)
;   p = (src[y][x] + src[y][x+1] + 1) >> 1
;   dst[y][x] = (dst[y][x] + p + 1) >> 1           for a 4x4 block.
;-----------------------------------------------------------------------------
ALIGN 16
AddBlock4x4_02:
    push    esi
    push    edi
    loadparam 1
    loadmask1

%rep 4
    load1   0,0                     ; bytes at x, stay on this row
    load2   1,1                     ; bytes at x+1, then step to next row
    avg1
    saveadd1
%endrep

    pop     edi
    pop     esi
    ret     12

;-----------------------------------------------------------------------------
; AddBlock4x4_20(src, dst, src pitch)
;   p = (src[y][x] + src[y+1][x] + 1) >> 1
;   dst[y][x] = (dst[y][x] + p + 1) >> 1           for a 4x4 block.
;   saveadd1 destroys mm0/mm1 and saveadd2 destroys mm2/mm3; each pair is
;   reloaded by the following load before it is needed again.
;-----------------------------------------------------------------------------
ALIGN 16
AddBlock4x4_20:
    push    esi
    push    edi
    loadparam 1
    loadmask1

    load1   0,1
%rep 2
    load2   0,1
    avg1
    saveadd1

    load1   0,1
    avg2
    saveadd2
%endrep

    pop     edi
    pop     esi
    ret     12

;-----------------------------------------------------------------------------
; AddBlock4x4_22(src, dst, src pitch)
;   p = (src[y][x] + src[y][x+1] + src[y+1][x] + src[y+1][x+1] + 2) >> 2
;   dst[y][x] = (dst[y][x] + p + 1) >> 1           for a 4x4 block.
;   The four-sample average needs the 0x03/0xFC masks while saveadd needs
;   0x01/0xFE, so the masks are switched around each saveadd using the -2
;   constant held in mm5 (mm5 is used instead of mm4 because saveadd
;   overwrites mm4).
;-----------------------------------------------------------------------------
ALIGN 16
AddBlock4x4_22:
    push    esi
    push    edi
    loadparam 1
    loadmask4

    load1hv 2
%rep 2
    load2hv 2
    pcmpeqb mm5,mm5                 ;-1
    paddb   mm0,mm2
    paddb   mm5,mm5                 ;-2
    paddb   mm1,mm3
    psubb   mm0,mm5                 ;+=2
    pand    mm0,mm7
    psrlq   mm0,2
    paddb   mm0,mm1
    paddb   mm6,mm5                 ;0x03-2=0x01
    psubb   mm7,mm5                 ;0xFC+2=0xFE
    saveadd1
    psubb   mm6,mm5                 ;restore mask (0x01+2=0x03)
    paddb   mm7,mm5                 ;restore mask (0xFE-2=0xFC)

    load1hv 2
    pcmpeqb mm5,mm5                 ;-1
    paddb   mm2,mm0
    paddb   mm5,mm5                 ;-2
    paddb   mm3,mm1
    psubb   mm2,mm5                 ;+=2
    pand    mm2,mm7
    psrlq   mm2,2
    paddb   mm2,mm3
    paddb   mm6,mm5                 ;0x03-2=0x01
    psubb   mm7,mm5                 ;0xFC+2=0xFE
    saveadd2
    psubb   mm6,mm5                 ;restore mask (0x01+2=0x03)
    paddb   mm7,mm5                 ;restore mask (0xFE-2=0xFC)
%endrep

    pop     edi
    pop     esi
    ret     12

%endif