/*****************************************************************************
 *
 * This program is free software ; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id: mcomp_mips64.c 284 2005-10-04 08:54:26Z picard $
 *
 * The Core Pocket Media Player
 * Copyright (c) 2004-2005 Gabor Kovacs
 *
 ****************************************************************************/

#include "../common.h"
#include "softidct.h"

#if defined(MIPS64)

// important: disable interrupts before using 64bit registers (but not for too long, or the system could freeze)
// important: $8 can't be used as a 64bit register (it is trashed by some kernel routine)
//
// register usage:
//   $8       src end pointer
//   $4       src pointer
//   $5       dst pointer
//   $6       src stride
//   $7       dst stride
//   $2,$9    first item, lower 8 bytes (in two forms)
//   $10,$11  first item, upper 8 bytes (in two forms) - for 16x16 macroblocks
//   $12,$13  second item, lower 8 bytes (in two forms)
//   $14,$15  second item, upper 8 bytes (in two forms) - for 16x16 macroblocks
//   $24      0x0101 0101 0101 0101 - for non horver
//   $25      0xFEFE FEFE FEFE FEFE - for non horver
//   $24      rounding constant - for horver
//   $25      temporary - for 16x16 horver
//   $3       0x0303 0303 0303 0303 - for horver
//   $1       0xFCFC FCFC FCFC FCFC - for horver
//   $16      temporary - for 16x16 horver (must be saved/restored)

// src end pointer = src + 8*stride (8x8 block)
#define SET_SRCEND8 \
	"sll $8,$6,3;" \
	"addu $8,$4,$8;"

// src end pointer = src + 16*stride (16x16 macroblock)
#define SET_SRCEND16 \
	"sll $8,$6,4;" \
	"addu $8,$4,$8;"

// $24 = 0x0101...01, $25 = ~$24 = 0xFEFE...FE
#define SET_MASKS \
	"li $24,0x01010101;" \
	"dsll $25,$24,32;" \
	"or $24,$24,$25;" \
	"nor $25,$24,$0;"

// $3 = 0x0303...03, $1 = ~$3 = 0xFCFC...FC ($1 is $at, hence .set noat)
#define SET_MASKS2 \
	".set noat;" \
	"li $3,0x03030303;" \
	"dsll $1,$3,32;" \
	"or $3,$3,$1;" \
	"nor $1,$3,$0;"

// $2 = 8 src bytes, $9 = per-byte $2>>1 (low bits masked off first)
#define LOAD_FIRST8(ofs) \
	"uld $2, " #ofs "($4);" \
	"and $9,$2,$25;" \
	"dsrl $9,$9,1;"

#define LOAD_FIRST16(ofs) \
	"uld $2, " #ofs "($4);" \
	"uld $10," #ofs "+8($4);" \
	"and $9,$2,$25;" \
	"and $11,$10,$25;" \
	"dsrl $9,$9,1;" \
	"dsrl $11,$11,1;"

#define LOAD_SECOND8(ofs) \
	"uld $12," #ofs "($4);" \
	"and $13,$12,$25;" \
	"dsrl $13,$13,1;"

#define LOAD_SECOND16(ofs) \
	"uld $12," #ofs "($4);" \
	"uld $14," #ofs "+8($4);" \
	"and $13,$12,$25;" \
	"and $15,$14,$25;" \
	"dsrl $13,$13,1;" \
	"dsrl $15,$15,1;"

// for the horver paths each byte is split into its low 2 bits and its
// high 6 bits (pre-shifted); $2 accumulates the low sums of src[x] and
// src[x+1], $9 the pre-shifted high sums
#define LOAD_FIRST8_HV \
	"uld $2,0($4);" \
	"uld $9,1($4);" \
	"and $10,$2,$1;" \
	"and $11,$9,$1;" \
	"and $2,$2,$3;" \
	"and $9,$9,$3;" \
	"dsrl $10,$10,2;" \
	"dsrl $11,$11,2;" \
	"daddu $2,$2,$9;" \
	"daddu $9,$10,$11;"

#define LOAD_FIRST16_HV \
	"uld $2,0($4);" \
	"uld $9,1($4);" \
	"and $16,$2,$1;" \
	"and $25,$9,$1;" \
	"and $2,$2,$3;" \
	"and $9,$9,$3;" \
	"dsrl $16,$16,2;" \
	"dsrl $25,$25,2;" \
	"daddu $2,$2,$9;" \
	"daddu $9,$16,$25;" \
	\
	"uld $10,8($4);" \
	"uld $11,9($4);" \
	"and $16,$10,$1;" \
	"and $25,$11,$1;" \
	"and $10,$10,$3;" \
	"and $11,$11,$3;" \
	"dsrl $16,$16,2;" \
	"dsrl $25,$25,2;" \
	"daddu $10,$10,$11;" \
	"daddu $11,$16,$25;"

#define LOAD_SECOND8_HV \
	"uld $12,0($4);" \
	"uld $13,1($4);" \
	"and $14,$12,$1;" \
	"and $15,$13,$1;" \
	"and $12,$12,$3;" \
	"and $13,$13,$3;" \
	"dsrl $14,$14,2;" \
	"dsrl $15,$15,2;" \
	"daddu $12,$12,$13;" \
	"daddu $13,$14,$15;"

#define LOAD_SECOND16_HV \
	"uld $12,0($4);" \
	"uld $13,1($4);" \
	"and $16,$12,$1;" \
	"and $25,$13,$1;" \
	"and $12,$12,$3;" \
	"and $13,$13,$3;" \
	"dsrl $16,$16,2;" \
	"dsrl $25,$25,2;" \
	"daddu $12,$12,$13;" \
	"daddu $13,$16,$25;" \
	\
	"uld $14,8($4);" \
	"uld $15,9($4);" \
	"and $16,$14,$1;" \
	"and $25,$15,$1;" \
	"and $14,$14,$3;" \
	"and $15,$15,$3;" \
	"dsrl $16,$16,2;" \
	"dsrl $25,$25,2;" \
	"daddu $14,$14,$15;" \
	"daddu $15,$16,$25;"

// per-byte $2 = (a+b+1)>>1
#define AVG8 \
	"or $2,$2,$12;" \
	"and $2,$2,$24;" \
	"daddu $2,$2,$9;" \
	"daddu $2,$2,$13;"

#define AVG16 \
	"or $2,$2,$12;" \
	"or $10,$10,$14;" \
	"and $2,$2,$24;" \
	"and $10,$10,$24;" \
	"daddu $2,$2,$9;" \
	"daddu $10,$10,$11;" \
	"daddu $2,$2,$13;" \
	"daddu $10,$10,$15;"

// per-byte $2 = (a+b)>>1 (rounds down)
#define AVGROUND8 \
	"and $2,$2,$12;" \
	"and $2,$2,$24;" \
	"daddu $2,$2,$9;" \
	"daddu $2,$2,$13;"

#define AVGROUND16 \
	"and $2,$2,$12;" \
	"and $10,$10,$14;" \
	"and $2,$2,$24;" \
	"and $10,$10,$24;" \
	"daddu $2,$2,$9;" \
	"daddu $10,$10,$11;" \
	"daddu $2,$2,$13;" \
	"daddu $10,$10,$15;"
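// The averaging macros above are byte-parallel without any SIMD hardware:
// for each byte, (a+b+1)>>1 == (a>>1) + (b>>1) + ((a|b)&1), and the round
// variant (a+b)>>1 == (a>>1) + (b>>1) + ((a&b)&1). Masking with 0xFE..
// before the shift keeps a byte's low bit from leaking into its neighbour,
// and the 0x01.. mask extracts the per-byte rounding bit; the pre-shifted
// halves are the second of the "two forms" kept by the LOAD_* macros.
//
// Reference sketch only (not compiled, not part of the original code): the
// same identity in plain C on four byte lanes; the function name and the
// 32-bit lane type are assumptions.
#if 0
static unsigned int Avg2RoundUp(unsigned int a, unsigned int b)
{
	unsigned int lsb = (a | b) & 0x01010101U;   // per-byte rounding bit
	unsigned int ah  = (a & 0xFEFEFEFEU) >> 1;  // per-byte a>>1, no cross-lane leak
	unsigned int bh  = (b & 0xFEFEFEFEU) >> 1;  // per-byte b>>1
	return ah + bh + lsb;                       // sums stay inside their byte lanes
}
#endif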
// reuse the second row as the next iteration's first row
#define SWAPSET8 \
	"move $2,$12;" \
	"move $9,$13;"

#define SWAPSET16 \
	"move $2,$12;" \
	"move $9,$13;" \
	"move $10,$14;" \
	"move $11,$15;"

// store 8 (16) bytes and advance dst by one row
#define WRITE8 \
	"sdr $2,0($5);" \
	"addu $5,$5,$7;"

#define WRITE16 \
	"sdr $2,0($5);" \
	"sdr $10,8($5);" \
	"addu $5,$5,$7;"

// save/restore $16 (s0) on the stack
#define SAVE \
	"addiu $sp,$sp,-4;" \
	"sw $16,0($sp);"

#define RESTORE \
	"lw $16,0($sp);" \
	"addiu $sp,$sp,4;"

#ifdef MIPSVR41XX
// cache without loading
#define CACHE16 \
	".set noreorder;" \
	"cache 13,0($5);" \
	".set reorder;"
#else
#define CACHE16
#endif

// 8x8: plain copy
void CopyBlock(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		"loop:"
		"uld $2,0($4);"
		"addu $4,$4,$6;"
		"sdr $2,0($5);"
		"addu $5,$5,$7;"
		"bne $4,$8,loop;");
}

// 8x8: horizontal half-pel, dst = (src[x] + src[x+1] + 1) >> 1
void CopyBlockHor(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS
		"loophor:"
		LOAD_FIRST8(0)
		LOAD_SECOND8(1)
		"addu $4,$4,$6;"
		AVG8
		WRITE8
		"bne $4,$8,loophor;");
}

// 8x8: horizontal half-pel, rounding variant, dst = (src[x] + src[x+1]) >> 1
void CopyBlockHorRound(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS
		"loophorround:"
		LOAD_FIRST8(0)
		LOAD_SECOND8(1)
		"addu $4,$4,$6;"
		AVGROUND8
		WRITE8
		"bne $4,$8,loophorround;");
}

// 8x8: vertical half-pel, averages row y with row y+1
void CopyBlockVer(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS
		LOAD_FIRST8(0)
		"loopver:"
		"addu $4,$4,$6;"
		LOAD_SECOND8(0)
		AVG8
		WRITE8
		SWAPSET8
		"bne $4,$8,loopver;");
}

void CopyBlockVerRound(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS
		LOAD_FIRST8(0)
		"loopverround:"
		"addu $4,$4,$6;"
		LOAD_SECOND8(0)
		AVGROUND8
		WRITE8
		SWAPSET8
		"bne $4,$8,loopverround;");
}

// 8x8: half-pel in both directions, dst = (s00+s01+s10+s11+2) >> 2
void CopyBlockHorVer(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS2
		"dsll $24,$3,1;"
		"and $24,$24,$3;" // 0x0202 0202 0202 0202

		// preprocessing: first row loaded outside the loop
		LOAD_FIRST8_HV
		"loophorver:"
		"addu $4,$4,$6;"
		LOAD_SECOND8_HV
		"daddu $2,$2,$12;"
		"daddu $9,$9,$13;"
		"daddu $2,$2,$24;"
		"and $2,$2,$1;"
		"dsrl $2,$2,2;"
		"daddu $2,$2,$9;"
		WRITE8
		SWAPSET8
		"bne $4,$8,loophorver;");
}

// 8x8: half-pel in both directions, rounding variant, dst = (s00+s01+s10+s11+1) >> 2
void CopyBlockHorVerRound(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS2
		"dsrl $24,$3,1;"
		"and $24,$24,$3;" // 0x0101 0101 0101 0101

		// preprocessing: first row loaded outside the loop
		LOAD_FIRST8_HV
		"loophorverround:"
		"addu $4,$4,$6;"
		LOAD_SECOND8_HV
		"daddu $2,$2,$12;"
		"daddu $9,$9,$13;"
		"daddu $2,$2,$24;"
		"and $2,$2,$1;"
		"dsrl $2,$2,2;"
		"daddu $2,$2,$9;"
		WRITE8
		SWAPSET8
		"bne $4,$8,loophorverround;");
}
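// The HorVer paths above average four neighbouring samples per output byte:
// (s1+s2+s3+s4+r) >> 2, with r = 2 for the plain variant and r = 1 for the
// Round variant. Because x == 4*(x>>2) + (x&3), the LOAD_*_HV macros split
// every byte into its high six bits (0xFC.. mask, then shifted right by 2)
// and its low two bits (0x03.. mask), and accumulate the two partial sums
// separately: the low sums (at most 12+r) and the pre-shifted high sums
// (at most 252) both fit in a byte lane, so a whole doubleword is averaged
// with no carries between lanes.
//
// Reference sketch only (not compiled, not part of the original code): the
// same decomposition in plain C on four byte lanes; names and types are
// assumptions.
#if 0
static unsigned int Avg4Lanes(unsigned int s1, unsigned int s2,
                              unsigned int s3, unsigned int s4,
                              unsigned int r) // 0x02020202U or 0x01010101U
{
	unsigned int lo = (s1 & 0x03030303U) + (s2 & 0x03030303U)
	                + (s3 & 0x03030303U) + (s4 & 0x03030303U) + r;
	unsigned int hi = ((s1 & 0xFCFCFCFCU) >> 2) + ((s2 & 0xFCFCFCFCU) >> 2)
	                + ((s3 & 0xFCFCFCFCU) >> 2) + ((s4 & 0xFCFCFCFCU) >> 2);
	return hi + ((lo & 0xFCFCFCFCU) >> 2);      // fold the rounded low sums back in
}
#endif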
// 16x16 macroblock versions: same scheme, two doublewords per row
void CopyMBlock(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND16
		"loopm:"
		CACHE16
		"uld $2,0($4);"
		"uld $10,8($4);"
		"addu $4,$4,$6;"
		WRITE16
		"bne $4,$8,loopm;");
}

void CopyMBlockHor(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND16
		SET_MASKS
		"loopmhor:"
		LOAD_FIRST16(0)
		LOAD_SECOND16(1)
		"addu $4,$4,$6;"
		CACHE16
		AVG16
		WRITE16
		"bne $4,$8,loopmhor;");
}

void CopyMBlockHorRound(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND16
		SET_MASKS
		"loopmhorround:"
		LOAD_FIRST16(0)
		LOAD_SECOND16(1)
		"addu $4,$4,$6;"
		CACHE16
		AVGROUND16
		WRITE16
		"bne $4,$8,loopmhorround;");
}

void CopyMBlockVer(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND16
		SET_MASKS
		LOAD_FIRST16(0)
		"loopmver:"
		"addu $4,$4,$6;"
		LOAD_SECOND16(0)
		CACHE16
		AVG16
		WRITE16
		SWAPSET16
		"bne $4,$8,loopmver;");
}

void CopyMBlockVerRound(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND16
		SET_MASKS
		LOAD_FIRST16(0)
		"loopmverround:"
		"addu $4,$4,$6;"
		LOAD_SECOND16(0)
		CACHE16
		AVGROUND16
		WRITE16
		SWAPSET16
		"bne $4,$8,loopmverround;");
}

// the loop body is split across two __asm blocks; the label defined in the
// first block is branched to from the second
void CopyMBlockHorVer(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SAVE
		SET_SRCEND16
		SET_MASKS2
		"dsll $24,$3,1;"
		"and $24,$24,$3;" // 0x0202 0202 0202 0202

		// preprocessing: first row loaded outside the loop
		LOAD_FIRST16_HV
		"loopmhorver:"
		"addu $4,$4,$6;"
		LOAD_SECOND16_HV
		CACHE16);
	__asm(
		"daddu $2,$2,$12;"
		"daddu $9,$9,$13;"
		"daddu $10,$10,$14;"
		"daddu $11,$11,$15;"
		"daddu $2,$2,$24;"
		"daddu $10,$10,$24;"
		"and $2,$2,$1;"
		"and $10,$10,$1;"
		"dsrl $2,$2,2;"
		"dsrl $10,$10,2;"
		"daddu $2,$2,$9;"
		"daddu $10,$10,$11;"
		WRITE16
		SWAPSET16
		"bne $4,$8,loopmhorver;"
		RESTORE);
}

void CopyMBlockHorVerRound(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SAVE
		SET_SRCEND16
		SET_MASKS2
		"dsrl $24,$3,1;"
		"and $24,$24,$3;" // 0x0101 0101 0101 0101

		// preprocessing: first row loaded outside the loop
		LOAD_FIRST16_HV
		"loopmhorverround:"
		"addu $4,$4,$6;");
	__asm(
		LOAD_SECOND16_HV
		CACHE16
		"daddu $2,$2,$12;"
		"daddu $9,$9,$13;"
		"daddu $10,$10,$14;"
		"daddu $11,$11,$15;"
		"daddu $2,$2,$24;"
		"daddu $10,$10,$24;"
		"and $2,$2,$1;"
		"and $10,$10,$1;"
		"dsrl $2,$2,2;"
		"dsrl $10,$10,2;"
		"daddu $2,$2,$9;"
		"daddu $10,$10,$11;"
		WRITE16
		SWAPSET16
		"bne $4,$8,loopmhorverround;"
		RESTORE);
}

// 8x8: dst = (src + dst + 1) >> 1 per byte
void AddBlock8x8(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND8
		SET_MASKS
		"loopadd:"
		"ldr $2,0($4);"
		"addu $4,$4,$6;"
		"ldr $9,0($5);"
		"and $11,$2,$25;"
		"or $2,$2,$9;"
		"and $2,$2,$24;"
		"dsrl $11,$11,1;"
		"daddu $2,$2,$11;"
		"and $9,$9,$25;"
		"dsrl $9,$9,1;"
		"daddu $2,$2,$9;"
		"sdr $2,0($5);"
		"addu $5,$5,$7;"
		"bne $4,$8,loopadd;");
}

// 16x16: dst = (src + dst + 1) >> 1 per byte
void AddBlock16x16(unsigned char * Src, unsigned char * Dst, int SrcStride, int DstStride)
{
	__asm(
		SET_SRCEND16
		SET_MASKS
		"loopadd16:"
		"ldr $2,0($4);"
		"ldr $10,8($4);"
#ifdef MIPSVR41XX
		".set noreorder;"
		"cache 17,0($4);" // hit invalidate (lose changes)
		".set reorder;"
#endif
		"addu $4,$4,$6;"
		"ldr $9,0($5);"
		"and $11,$2,$25;"
		"or $2,$2,$9;"
		"and $2,$2,$24;"
		"dsrl $11,$11,1;"
		"daddu $2,$2,$11;"
		"and $9,$9,$25;"
		"dsrl $9,$9,1;"
		"daddu $2,$2,$9;"
		"ldr $11,8($5);"
		"and $9,$10,$25;"
		"or $10,$10,$11;"
		"and $10,$10,$24;"
		"dsrl $9,$9,1;"
		"daddu $10,$10,$9;"
		"and $11,$11,$25;"
		"dsrl $11,$11,1;"
		"daddu $10,$10,$11;"
		"sdr $2,0($5);"
		"sdr $10,8($5);"
		"addu $5,$5,$7;"
		"bne $4,$8,loopadd16;");
}

#endif
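// Usage note (an assumption, not from the original file): callers in the
// softidct code typically dispatch on the half-pel bits of the motion
// vector, picking the Hor/Ver/HorVer variant from (dx&1, dy&1), and the
// *Round forms when the bitstream's rounding control bit is set. A
// hypothetical dispatch might look like:
//
//   static void (* const Copy8x8[4])(unsigned char*, unsigned char*, int, int) =
//       { CopyBlock, CopyBlockHor, CopyBlockVer, CopyBlockHorVer };
//   Copy8x8[(dx & 1) | ((dy & 1) << 1)](Src, Dst, SrcStride, DstStride);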