;*****************************************************************************
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
;*
;* $Id: idct_arm.asm 284 2005-10-04 08:54:26Z picard $
;*
;* The Core Pocket Media Player
;* Copyright (c) 2004-2005 Gabor Kovacs
;*
;*****************************************************************************

    AREA |.text|, CODE

    EXPORT IDCT_Block4x8
    EXPORT IDCT_Block8x8
    EXPORT IDCT_Block4x8Swap
    EXPORT IDCT_Block8x8Swap

; r6 Block
; r7,r8 must be saved

    macro
    MCol8 $Name,$Rotate,$Pitch

$Name PROC

; r10 = x0
; r4 = x1
; r2 = x2
; r1 = x3
; r3 = x4
; r12 = x5
; r0 = x6
; r5 = x7
; r11 = x8
; r9 = tmp (x567)
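
; One 8-point column IDCT per call, with a fast-path dispatch on which
; coefficients are nonzero: DC only (all eight outputs DC << 3), DC plus
; coefficient 1 (Mode1), even coefficients only (Mode2), full transform
; (Mode3). $Pitch is the byte distance between successive coefficients
; (16 = a true column of the row-major block, 2 = a contiguous row for
; the transposing Swap variant); when $Rotate is set, r6 is advanced by
; the caller-supplied r9 after the loads, so the results store elsewhere.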

    ldrsh r4, [r6, #4*$Pitch]
    ldrsh r0, [r6, #5*$Pitch]
    ldrsh r12,[r6, #7*$Pitch]
    ldrsh r5, [r6, #3*$Pitch]
    ldrsh r2, [r6, #6*$Pitch]
    ldrsh r1, [r6, #2*$Pitch]
    ldrsh r3, [r6, #1*$Pitch]
    ldrsh r10,[r6]
    if $Rotate
    add r6,r6,r9
    endif

    orr r9, r12, r0
    orr r9, r9, r5
    orr r11, r9, r2
    orr r11, r11, r4
    orrs r11, r11, r1

    bne $Name.Mode2
    cmp r3, #0
    bne $Name.Mode1
    if $Rotate=0
    cmp r10, #0
    beq $Name.Zero
    endif
    mov r10, r10, lsl #3
    strh r10, [r6]
    strh r10, [r6, #0x10]
    strh r10, [r6, #0x20]
    strh r10, [r6, #0x30]
    strh r10, [r6, #0x40]
    strh r10, [r6, #0x50]
    strh r10, [r6, #0x60]
    strh r10, [r6, #0x70]
$Name.Zero
    mov pc,lr
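
; Mode1: only x0 (DC) and x4 (coefficient 1) are nonzero. Row n of the
; column becomes (x0*2048 + Wn*x4 + 128) >> 8 with Wn =
; 2048*sqrt(2)*cos((2n+1)*pi/16); rows 4..7 reuse rows 3..0 with Wn
; negated. The two-operand immediates below are armasm rotated constants:
; #0x8D, 30 means 0x8D ROR 30 = 0x234 = 564, so 564|1 = 565 = W7, and so
; on for W1 = 2841, W3 = 2408, W5 = 1609.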
$Name.Mode1 ;x0,x4
    mov r11, r3
    mov r2, #0x8D, 30 ; 0x234 = 564
    orr r2, r2, #1
    mov r9, r3
    mul r2, r11, r2
    mov r11, #0xB1, 28 ; 0xB10 = 2832
    orr r11, r11, #9
    mul r4, r9, r11
    mov r11, #0x96, 28 ; 0x960 = 2400
    orr r11, r11, #8
    mul r5, r9, r11
    mov r11, #0x19, 26 ; 0x640 = 1600
    mov r1, r10, lsl #11
    orr r11, r11, #9
    mul r0, r3, r11
    add r1, r1, #0x80 ; 0x80 = 128

    add r3, r4, r1
    add r11, r5, r1
    mov r3, r3, asr #8
    mov r11, r11, asr #8
    strh r3, [r6]
    strh r11, [r6, #0x10] ; 0x10 = 16

    add r3, r0, r1
    add r11, r2, r1
    mov r3, r3, asr #8
    mov r11, r11, asr #8
    strh r3, [r6, #0x20] ; 0x20 = 32
    strh r11, [r6, #0x30] ; 0x30 = 48

    sub r3, r1, r2
    sub r11, r1, r0
    mov r3, r3, asr #8
    mov r11, r11, asr #8
    strh r3, [r6, #0x40] ; 0x40 = 64
    strh r11, [r6, #0x50] ; 0x50 = 80

    sub r3, r1, r5
    sub r11, r1, r4
    mov r3, r3, asr #8
    mov r11, r11, asr #8
    strh r3, [r6, #0x60] ; 0x60 = 96
    strh r11, [r6, #0x70] ; 0x70 = 112
    mov pc,lr
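
; Mode2: the odd coefficients (1,3,5,7) are all zero, so the 8-point IDCT
; reduces to its even half: a butterfly on coefficients 0 and 4 plus the
; W6/W2 rotation of coefficients 2 and 6 (1104+4 = 1108 = W6,
; 3776+8 = 3784 = W2+W6, 0x620 = 1568 = W2-W6). The result is symmetric,
; so rows 7..4 simply reuse rows 0..3.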
$Name.Mode2 ;x0,x1,x2,x3
    orrs r11, r9, r3
    bne $Name.Mode3
    mov r3, r10, lsl #11
    add r3, r3, #128
    mov r9, #0x45, 28 ; 0x450 = 1104
    add r5, r3, r4, lsl #11
    add r11, r2, r1
    orr r9, r9, #4
    sub r3, r3, r4, lsl #11
    mul r4, r11, r9
    mov r11, #0x3B, 26 ; 0xEC0 = 3776
    orr r11, r11, #8
    mul r11, r2, r11
    sub r2, r4, r11
    mov r11, #0x62, 28 ; 0x620 = 1568
    mul r11, r1, r11
    add r0, r2, r3
    add r1, r11, r4
    add r4, r5, r1
    sub r3, r3, r2
    sub r5, r5, r1
    mov r1, r4, asr #8
    mov r3, r3, asr #8
    mov r2, r0, asr #8
    mov r4, r5, asr #8
    strh r1, [r6,#0x00]
    strh r2, [r6,#0x10]
    strh r3, [r6,#0x20]
    strh r4, [r6,#0x30]
    strh r4, [r6,#0x40]
    strh r3, [r6,#0x50]
    strh r2, [r6,#0x60]
    strh r1, [r6,#0x70]
    mov pc,lr
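
; Mode3: the general case, apparently the classic Chen-Wang fixed-point
; 8-point IDCT (the factors match the W constants declared below): the
; odd part combines coefficients 1,7 (W1/W7) and 3,5 (W3/W5), the even
; part coefficients 0,4 and 2,6 (W2/W6), then two butterfly stages merge
; them. 181 is roughly 128*sqrt(2), so (181*x + 128) >> 8 rotates the two
; middle odd terms by sqrt(1/2). Inputs were scaled up by << 11 with +128
; rounding, hence the final asr #8 back to coefficient precision.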
$Name.Mode3 ;x0,x1,x2,x3,x4,x5,x6,x7

    mov r9, #0x8D, 30
    orr r9, r9, #1 ;W7
    add r11, r12, r3
    mul r11, r9, r11 ;x8 = W7 * (x5 + x4)

    mov r9, #0x8E, 28
    orr r9, r9, #4 ;W1-W7
    mla r3, r9, r3, r11 ;x4 = x8 + (W1-W7) * x4

    mvn r9, #0xD40
    eor r9, r9, #0xD ;-W1-W7
    mla r12, r9, r12, r11 ;x5 = x8 + (-W1-W7) * x5

    mov r9, #0x96, 28 ;
    orr r9, r9, #8 ;W3
    add r11, r0, r5
    mul r11, r9, r11 ;x8 = W3 * (x6 + x7)

    mvn r9, #0x310
    eor r9, r9, #0xE ;W5-W3
    mla r0, r9, r0, r11 ;x6 = x8 + (W5-W3) * x6

    mvn r9, #0xFB0 ;-W3-W5
    mla r5, r9, r5, r11 ;x7 = x8 + (-W3-W5) * x7

    mov r10, r10, lsl #11
    add r10, r10, #128 ;x0 = (x0 << 11) + 128
    add r11, r10,r4,lsl #11 ;x8 = x0 + (x1 << 11)
    sub r10, r10,r4,lsl #11 ;x0 = x0 - (x1 << 11)

    mov r9, #0x45, 28
    orr r9, r9, #4 ;W6
    add r4, r1, r2
    mul r4, r9, r4 ;x1 = W6 * (x3 + x2)

    mvn r9, #0xEC0
    eor r9, r9, #0x7 ;-W2-W6
    mla r2, r9, r2, r4 ;x2 = x1 + (-W2-W6) * x2

    mov r9, #0x620 ;W2-W6
    mla r1, r9, r1, r4 ;x3 = x1 + (W2-W6) * x3

    add r4, r3, r0 ;x1 = x4 + x6
    sub r3, r3, r0 ;x4 -= x6
    add r0, r12,r5 ;x6 = x5 + x7
    sub r12,r12,r5 ;x5 -= x7
    add r5, r11,r1 ;x7 = x8 + x3
    sub r11,r11,r1 ;x8 -= x3
    add r1, r10,r2 ;x3 = x0 + x2
    sub r10,r10,r2 ;x0 -= x2

    add r9, r3, r12 ;x4 + x5
    sub r3, r3, r12 ;x4 - x5
    mov r12, #181
    mul r2, r9, r12 ;181 * (x4 + x5)
    mul r9, r3, r12 ;181 * (x4 - x5)
    add r2, r2, #128 ;x2 = 181 * (x4 + x5) + 128
    add r3, r9, #128 ;x4 = 181 * (x4 - x5) + 128

    add r9,r5,r4
    sub r5,r5,r4
    mov r9,r9,asr #8 ;(x7 + x1) >> 8
    mov r5,r5,asr #8 ;(x7 - x1) >> 8
    strh r9,[r6,#0x00]
    strh r5,[r6,#0x70]

    add r9,r1,r2,asr #8
    sub r1,r1,r2,asr #8
    mov r9,r9,asr #8 ;(x3 + x2) >> 8
    mov r1,r1,asr #8 ;(x3 - x2) >> 8
    strh r9,[r6,#0x10]
    strh r1,[r6,#0x60]

    add r9,r10,r3,asr #8
    sub r10,r10,r3,asr #8
    mov r9,r9,asr #8 ;(x0 + x4) >> 8
    mov r10,r10,asr #8 ;(x0 - x4) >> 8
    strh r9,[r6,#0x20]
    strh r10,[r6,#0x50]

    add r9,r11,r0
    sub r11,r11,r0
    mov r9,r9,asr #8 ;(x8 + x6) >> 8
    mov r11,r11,asr #8 ;(x8 - x6) >> 8
    strh r9,[r6,#0x30]
    strh r11,[r6,#0x40]

    mov pc,lr
    mend

    MCol8 Col8,0,16
    MCol8 Col8Swap,1,2
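
; Two column routines are generated from the macro:
;   Col8     ($Pitch=16): coefficients lie one 16-byte row apart, i.e. a
;            true column of the row-major 8x8 halfword block, transformed
;            in place.
;   Col8Swap ($Pitch=2, $Rotate=1): loads a contiguous row instead, then
;            adds the caller-preloaded r9 to r6 before storing, so each
;            input row lands as an output column (a transposing pass).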

; r0 Block[0]
; r6 Block
; r7 Src
; r8 Dst

    ALIGN 16
RowConst PROC

    add r0, r0, #0x20 ; 0x20 = 32
    cmp r7, #0
    mov r3, r0, asr #6
    beq RowConst_NoSrc
    cmp r3, #0
    beq RowConst_Zero
    blt RowConst_Sub
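
; RowConst handles a row whose AC terms are all zero: every output pixel
; is the prediction byte plus the constant c = (Block[0] + 32) >> 6,
; clamped to 0..255. The add runs eight pixels at a time as a SWAR
; (SIMD-within-a-register) saturating byte add built on CarryMask
; (0x80808080): for sum = a + b, the per-byte carry out of bit 7 is
; ((a & b) | ((a ^ b) & ~sum)) & 0x80, collected in r5. Then
; r12 = r5 << 1 is exactly the carry that leaked into the neighbouring
; byte (r4 - r12 removes it), r12 - (r5 >> 7) expands each carrying byte
; to 0xFF, and orring the two yields the saturated bytes.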
RowConst_Add
    ldr r0, CarryMask
    ldr r2, [r7]
    orr r3, r3, r3, lsl #8
    orr r3, r3, r3, lsl #16
    add r4, r2, r3
    eor r11, r2, r3
    and r2, r3, r2
    bic r11, r11, r4
    orr r11, r11, r2
    and r5, r11, r0
    mov r12, r5, lsl #1
    sub r10, r4, r12
    sub r11, r12, r5, lsr #7
    ldr r2, [r7, #4]
    orr r11, r11, r10
    str r11, [r8]
    add r4, r2, r3
    eor r11, r2, r3
    and r2, r3, r2
    bic r11, r11, r4
    orr r11, r11, r2
    and r5, r11, r0
    mov r12, r5, lsl #1
    sub r10, r4, r12
    sub r11, r12, r5, lsr #7
    orr r11, r11, r10
    str r11, [r8, #4]
    add r7, r7, #8 ;source stride
    mov pc,lr
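
; The subtracting variant reuses the same saturating add through the
; complement identity a -sat c = ~(~a +sat c) (with c = -r3 here, a
; positive constant), which is why the source word is inverted both
; before and after.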
RowConst_Sub
    ldr r0, CarryMask
    ldr r2, [r7]
    rsb r3, r3, #0
    orr r3, r3, r3, lsl #8
    orr r3, r3, r3, lsl #16
    mvn r2, r2
    add r4, r2, r3
    eor r11, r2, r3
    and r2, r3, r2
    bic r11, r11, r4
    orr r11, r11, r2
    and r5, r11, r0
    mov r12, r5, lsl #1
    sub r10, r4, r12
    sub r11, r12, r5, lsr #7
    ldr r2, [r7, #4]
    orr r11, r11, r10
    mvn r11, r11
    str r11, [r8]
    mvn r2, r2
    add r4, r2, r3
    eor r11, r2, r3
    and r2, r3, r2
    bic r11, r11, r4
    orr r11, r11, r2
    and r5, r11, r0
    mov r12, r5, lsl #1
    sub r10, r4, r12
    sub r11, r12, r5, lsr #7
    orr r11, r11, r10
    mvn r11, r11
    str r11, [r8, #4]
    add r7, r7, #8 ;source stride
    mov pc,lr

RowConst_Zero
    ldr r1, [r7]
    ldr r2, [r7, #4]
    str r1, [r8]
    str r2, [r8, #4]
    add r7, r7, #8 ;source stride
    mov pc,lr

RowConst_NoSrc
    cmp r3, #0
    movmi r3, #0
    cmppl r3, #255
    movgt r3, #255
    orr r3, r3, r3, lsl #8
    orr r3, r3, r3, lsl #16
    str r3, [r8]
    str r3, [r8, #4]
    mov pc,lr

    ENDP

CarryMask DCD 0x80808080
W1 DCW 2841 ; 2048*sqrt(2)*cos(1*pi/16)
W3 DCW 2408 ; 2048*sqrt(2)*cos(3*pi/16)
nW5 DCW 0xF9B7 ;-1609 ; 2048*sqrt(2)*cos(5*pi/16)
W6 DCW 1108 ; 2048*sqrt(2)*cos(6*pi/16)
W7 DCW 565 ; 2048*sqrt(2)*cos(7*pi/16)
W2 DCW 2676 ; 2048*sqrt(2)*cos(2*pi/16)
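
; These literals sit in .text next to their users: ldrsh only has an
; 8-bit (+/-255 byte) pc-relative offset, so each block of W constants
; must stay within reach of the code that loads it (a second copy appears
; below for the 8x8 row pass).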

; r6 Block
; r7 Src
; r8 Dst
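
; All four entry points appear to share one C-level shape, inferred from
; the prologues rather than stated in this file:
;   void IDCT_BlockNxM(short *Block, unsigned char *Dst, int DstStride,
;                      const unsigned char *Src);
; r0=Block, r1=Dst, r2=DstStride, r3=Src (Src may be NULL; the row pass
; then skips prediction). The 4x8 entries run only four column passes and
; read four coefficients per row, i.e. they appear to assume the
; remaining coefficients are zero (a reduced IDCT). The Swap entries
; transpose on the fly: each Col8Swap call reads one contiguous row and,
; offset by r9, stores it as a column in the upper half of the 256-byte
; buffer, which the row loop then consumes up to BlockEnd.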

    ALIGN 16
IDCT_Block4x8Swap PROC

    add r0, r0, #256
    stmdb sp!, {r0, r2, r4 - r12, lr} ; r0=BlockEnd r2=DstStride
    sub r6, r0, #256 ;Block
    mov r7, r3 ;Src
    mov r8, r1 ;Dst

    mov r9,#128-0*16+0*2
    bl Col8Swap
    mov r9,#128-1*16+1*2
    add r6, r6, #1*16-0*2-128
    bl Col8Swap
    mov r9,#128-2*16+2*2
    add r6, r6, #2*16-1*2-128
    bl Col8Swap
    mov r9,#128-3*16+3*2
    add r6, r6, #3*16-2*2-128
    bl Col8Swap
    sub r6, r6, #6
    b Row4_Loop

    ALIGN 16
IDCT_Block4x8 PROC

    add r0, r0, #128
    stmdb sp!, {r0, r2, r4 - r12, lr} ; r0=BlockEnd r2=DstStride
    sub r6, r0, #128 ;Block
    mov r7, r3 ;Src
    mov r8, r1 ;Dst

    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    sub r6, r6, #6

Row4_Loop

    ldrsh r4, [r6, #4] ;x3
    ldrsh r5, [r6, #6] ;x7
    ldrsh r3, [r6, #2] ;x4
    ldrsh r0, [r6] ;x0

    orr r11, r5, r4
    orrs r11, r11, r3
    bne Row4_NoConst

    bl RowConst
    b Row4_Next
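
; Row pass fixed point (shared by IDCT_Block4x8 and its Swap variant):
; x0 enters as (x0 + 32) << 8, where the +32 pre-rounds the final
; asr #14, and each W product is pre-rounded to 3 fraction bits via
; (w*x + 4) >> 3. The 8x8 row pass below works the same way but with
; << 11 and asr #17, carrying three more fraction bits through the
; butterflies. Prediction bytes from Src, if present, are added before
; saturation.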

Row4_NoConst
    cmp r7, #0

    ldrsh r10, W7
    ldrsh r11, W1
    mov r2, #4
    add r0, r0, #32
    mov r0, r0, lsl #8 ;x0
    mla r14, r3, r10, r2 ;x5 = x4 * W7 + 4
    ldrsh r10, W3
    mla r3, r11, r3, r2 ;x4 = x4 * W1 + 4
    mov r14, r14, asr #3 ;x5 >>= 3
    ldrsh r11, nW5
    mla r12, r5, r10, r2 ;x6 = x7 * W3 + 4
    mov r3, r3, asr #3 ;x4 >>= 3
    ldrsh r10, W6
    mla r5, r11, r5, r2 ;x7 = x7 * -W5 + 4
    ldrsh r11, W2
    add r9, r3, r12, asr #3 ;x1 = x4 + (x6 >> 3)
    sub r3, r3, r12, asr #3 ;x4 = x4 - (x6 >> 3)
    mla r12, r4, r10, r2 ;x2 = x3 * W6 + 4
    mla r4, r11, r4, r2 ;x3 = x3 * W2 + 4
    add r2, r14, r5, asr #3 ;x6 = x5 + (x7 >> 3)
    sub r5, r14, r5, asr #3 ;x5 = x5 - (x7 >> 3)
    add r14, r0, r4, asr #3 ;x7 = x0 + (x3 >> 3)
    sub r4, r0, r4, asr #3 ;x8 = x0 - (x3 >> 3)
    add r10, r0, r12, asr #3;x3 = x0 + (x2 >> 3)
    sub r0, r0, r12, asr #3 ;x0 = x0 - (x2 >> 3)
    add r1, r5, r3
    mov r11, #181
    mul r12, r1, r11 ;x2 = 181 * (x5 + x4)
    sub r3, r3, r5
    mul r1, r3, r11 ;x4 = 181 * (x4 - x5)
    add r12, r12, #128 ;x2 += 128
    add r3, r1, #128 ;x4 += 128
    add r1, r14, r9 ;x5 = x7 + x1
    sub r5, r14, r9 ;x1 = x7 - x1
    add r11, r10, r12, asr #8 ;x7 = x3 + (x2 >> 8)
    sub r14, r10, r12, asr #8 ;x2 = x3 - (x2 >> 8)
    add r9, r0, r3, asr #8 ;x3 = x0 + (x4 >> 8)
    sub r3, r0, r3, asr #8 ;x4 = x0 - (x4 >> 8)
    add r12, r4, r2 ;x0 = x8 + x6
    sub r4, r4, r2 ;x6 = x8 - x6

    beq Row4_NoSrc

    ldrb r0, [r7]
    ldrb r2, [r7, #7]
    ldrb r10, [r7, #1]
    add r1, r0, r1, asr #14
    add r5, r2, r5, asr #14
    add r11, r10, r11, asr #14
    ldrb r2, [r7, #6]
    ldrb r0, [r7, #2]
    ldrb r10, [r7, #5]
    add r14, r2, r14, asr #14
    add r9, r0, r9, asr #14
    ldrb r0, [r7, #3]
    ldrb r2, [r7, #4]
    add r3, r10, r3, asr #14
    add r12, r0, r12, asr #14
    add r4, r2, r4, asr #14
    add r7, r7, #8 ;source stride
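
; Saturation test: or together all eight results; if no bits remain
; outside 0xFF, every value is already 0..255 and the clamp is skipped.
; In the clamp itself each tst sets Z and N from (value & 0xFFFFFF00),
; so movne forces 255 for out-of-range values and movmi then overrides
; with 0 for negative ones.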

Row4_Sat
    orr r0, r5, r14
    orr r0, r0, r4
    orr r0, r0, r1
    orr r0, r0, r12
    orr r0, r0, r11
    orr r0, r0, r9
    orr r0, r0, r3
    bics r0, r0, #0xFF ; 0xFF = 255
    beq Row4_Write

    mov r0, #0xFFFFFF00

    tst r1, r0
    movne r1, #0xFF
    movmi r1, #0x00

    tst r11, r0
    movne r11, #0xFF
    movmi r11, #0x00

    tst r9, r0
    movne r9, #0xFF
    movmi r9, #0x00

    tst r12, r0
    movne r12, #0xFF
    movmi r12, #0x00

    tst r4, r0
    movne r4, #0xFF
    movmi r4, #0x00

    tst r3, r0
    movne r3, #0xFF
    movmi r3, #0x00

    tst r14, r0
    movne r14, #0xFF
    movmi r14, #0x00

    tst r5, r0
    movne r5, #0xFF
    movmi r5, #0x00

Row4_Write
    strb r1, [r8]
    strb r11,[r8, #1]
    strb r9, [r8, #2]
    strb r12,[r8, #3]
    strb r4, [r8, #4]
    strb r3, [r8, #5]
    strb r14,[r8, #6]
    strb r5, [r8, #7]

Row4_Next
    ldr r2, [sp, #4] ;DstStride
    ldr r1, [sp, #0] ;BlockEnd

    add r6,r6,#16 ;Block += 16
    add r8,r8,r2 ;Dst += DstStride

    cmp r6,r1
    bne Row4_Loop

    ldmia sp!, {r0,r2,r4 - r12, pc}

Row4_NoSrc

    mov r5, r5, asr #14
    mov r14, r14, asr #14
    mov r12, r12, asr #14
    mov r1, r1, asr #14
    mov r11, r11, asr #14
    mov r9, r9, asr #14
    mov r3, r3, asr #14
    mov r4, r4, asr #14

    b Row4_Sat
    ENDP

; r6 Block
; r7 Src
; r8 Dst

    ALIGN 16
IDCT_Block8x8Swap PROC

    add r0, r0, #256
    stmdb sp!, {r0, r2, r4 - r12, lr} ; r0=BlockEnd r2=DstStride
    sub r6, r0, #256 ;Block
    mov r7, r3 ;Src
    mov r8, r1 ;Dst

    mov r9,#128-0*16+0*2
    bl Col8Swap
    mov r9,#128-1*16+1*2
    add r6, r6, #1*16-0*2-128
    bl Col8Swap
    mov r9,#128-2*16+2*2
    add r6, r6, #2*16-1*2-128
    bl Col8Swap
    mov r9,#128-3*16+3*2
    add r6, r6, #3*16-2*2-128
    bl Col8Swap
    mov r9,#128-4*16+4*2
    add r6, r6, #4*16-3*2-128
    bl Col8Swap
    mov r9,#128-5*16+5*2
    add r6, r6, #5*16-4*2-128
    bl Col8Swap
    mov r9,#128-6*16+6*2
    add r6, r6, #6*16-5*2-128
    bl Col8Swap
    mov r9,#128-7*16+7*2
    add r6, r6, #7*16-6*2-128
    bl Col8Swap
    sub r6, r6, #14
    b Row8_Loop

    ALIGN 16
IDCT_Block8x8 PROC

    add r0, r0, #128
    stmdb sp!, {r0, r2, r4 - r12, lr} ; r0=BlockEnd r2=DstStride
    sub r6, r0, #128 ;Block
    mov r7, r3 ;Src
    mov r8, r1 ;Dst

    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    add r6, r6, #2
    bl Col8
    sub r6, r6, #14

Row8_Loop

    ldrsh r0, [r6] ;x0
    ldrsh r3, [r6, #2] ;x4
    ldrsh r4, [r6, #4] ;x3
    ldrsh r5, [r6, #6] ;x7
    ldrsh r9, [r6, #8] ;x1
    ldrsh r2, [r6, #10] ;x6
    ldrsh r14,[r6, #12] ;x2
    ldrsh r1, [r6, #14] ;x5

    orr r11, r3, r4
    orr r11, r11, r5
    orr r11, r11, r9
    orr r11, r11, r2
    orr r11, r11, r14
    orrs r11, r11, r1
    bne Row8_NoConst

    bl RowConst
    b Row8_Next

_W3 DCW 2408 ; 2048*sqrt(2)*cos(3*pi/16)
_W6 DCW 1108 ; 2048*sqrt(2)*cos(6*pi/16)
_W7 DCW 565 ; 2048*sqrt(2)*cos(7*pi/16)

_W1_nW7 DCW 2276
_nW1_nW7 DCW 0xF2B2 ;-3406
_W5_nW3 DCW 0xFCE1 ;-799
_nW2_nW6 DCW 0xF138 ;-3784

    ALIGN 4
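
; A second copy of the cosine constants: ldrsh literals only reach
; +/-255 bytes, so the 8x8 row code needs its own table close by (it is
; never executed, sitting after an unconditional branch). The combined
; values (_W1_nW7 = W1-W7, _nW1_nW7 = -(W1+W7), _W5_nW3 = W5-W3,
; _nW2_nW6 = -(W2+W6)) fold the butterfly factor differences into single
; multiplies.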

Row8_NoConst
    cmp r7, #0

    add r0, r0, #32
    ldrsh r10, _W7
    mov r0, r0, lsl #11 ;x0 = (x0 + 32) << 11
    ldrsh r12, _W1_nW7
    add r11,r3,r1
    mul r11,r10,r11 ;x8 = W7 * (x4 + x5)
    ldrsh r10, _nW1_nW7
    mla r3, r12, r3, r11 ;x4 = x8 + (W1-W7) * x4
    ldrsh r12, _W3
    mla r1, r10, r1, r11 ;x5 = x8 + (-W1-W7) * x5
    ldrsh r10, _W5_nW3
    add r11,r2,r5 ;x6 + x7
    mul r11,r12,r11 ;x8 = W3 * (x6 + x7)
    mvn r12, #0xFB0 ;-W3-W5
    mla r2,r10,r2,r11 ;x6 = x8 + (W5-W3) * x6
    ldrsh r10, _W6
    mla r5,r12,r5,r11 ;x7 = x8 + (-W3-W5) * x7
    ldrsh r12, _nW2_nW6
    add r11, r0, r9, lsl #11;x8 = x0 + (x1 << 11)
    sub r0, r0, r9, lsl #11 ;x0 = x0 - (x1 << 11)
    add r9, r4, r14
    mul r9, r10, r9 ;x1 = W6 * (x3 + x2)
    mov r10, #0x620 ;W2-W6
    mla r14, r12, r14, r9 ;x2 = x1 + (-W2-W6) * x2
    mov r12, #181
    mla r4, r10, r4, r9 ;x3 = x1 + (W2-W6) * x3
    add r9, r3, r2 ;x1 = x4 + x6
    sub r3, r3, r2 ;x4 = x4 - x6
    add r2, r1, r5 ;x6 = x5 + x7
    sub r1, r1, r5 ;x5 = x5 - x7
    add r5, r11, r4 ;x7 = x8 + x3
    sub r11, r11, r4 ;x8 = x8 - x3
    add r4, r0, r14 ;x3 = x0 + x2
    sub r0, r0, r14 ;x0 = x0 - x2
    add r3, r3, #4 ;
    add r14, r3, r1 ;x2 = x4 + x5 + 4
    sub r3, r3, r1 ;x4 = x4 - x5 + 4
    mov r10, #16
    mov r14, r14, asr #3
    mov r3, r3, asr #3
    mla r14, r12, r14, r10 ;x2 = 181 * ((x4 + x5 + 4) >> 3) + 16
    mla r3, r12, r3, r10 ;x4 = 181 * ((x4 - x5 + 4) >> 3) + 16
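
; Unlike the column pass, the 181 rotation here pre-rounds with
; (x + 4) >> 3 before multiplying and finishes with asr #5 (16 being the
; half-ulp of that shift), presumably to keep 181 * x inside 32 bits now
; that the intermediates carry 11 extra fraction bits.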

    add r1, r5, r9 ;x5 = x7 + x1
    sub r9, r5, r9 ;x1 = x7 - x1
    add r5, r4, r14, asr #5 ;x7 = x3 + (x2 >> 5)
    sub r14,r4, r14, asr #5 ;x2 = x3 - (x2 >> 5)
    add r4, r0, r3, asr #5 ;x3 = x0 + (x4 >> 5)
    sub r3, r0, r3, asr #5 ;x4 = x0 - (x4 >> 5)
    add r0, r11, r2 ;x0 = x8 + x6
    sub r2, r11, r2 ;x6 = x8 - x6

    beq Row8_NoSrc

    ldrb r10, [r7]
    ldrb r12, [r7, #7]
    ldrb r11, [r7, #1]
    add r1, r10, r1, asr #17
    add r9, r12, r9, asr #17
    add r5, r11, r5, asr #17
    ldrb r10, [r7, #6]
    ldrb r12, [r7, #2]
    ldrb r11, [r7, #5]
    add r14, r10, r14, asr #17
    add r4, r12, r4, asr #17
    ldrb r10, [r7, #3]
    ldrb r12, [r7, #4]
    add r3, r11, r3, asr #17
    add r0, r10, r0, asr #17
    add r2, r12, r2, asr #17
    add r7, r7, #8 ;source stride

Row8_Sat
    orr r10, r1, r9
    orr r10, r10, r5
    orr r10, r10, r14
    orr r10, r10, r4
    orr r10, r10, r3
    orr r10, r10, r0
    orr r10, r10, r2
    bics r10, r10, #0xFF ; 0xFF = 255
    beq Row8_Write

    mov r10, #0xFFFFFF00

    tst r1, r10
    movne r1, #0xFF
    movmi r1, #0x00

    tst r9, r10
    movne r9, #0xFF
    movmi r9, #0x00

    tst r5, r10
    movne r5, #0xFF
    movmi r5, #0x00

    tst r14, r10
    movne r14, #0xFF
    movmi r14, #0x00

    tst r4, r10
    movne r4, #0xFF
    movmi r4, #0x00

    tst r3, r10
    movne r3, #0xFF
    movmi r3, #0x00

    tst r0, r10
    movne r0, #0xFF
    movmi r0, #0x00

    tst r2, r10
    movne r2, #0xFF
    movmi r2, #0x00

Row8_Write
    strb r1, [r8]
    strb r5, [r8, #1]
    strb r4, [r8, #2]
    strb r0, [r8, #3]
    strb r2, [r8, #4]
    strb r3, [r8, #5]
    strb r14,[r8, #6]
    strb r9, [r8, #7]

Row8_Next
    ldr r2, [sp, #4] ;DstStride
    ldr r1, [sp, #0] ;BlockEnd

    add r6,r6,#16 ;Block += 16
    add r8,r8,r2 ;Dst += DstStride

    cmp r6,r1
    bne Row8_Loop

    ldmia sp!, {r0,r2,r4 - r12, pc}

Row8_NoSrc

    mov r1, r1, asr #17
    mov r9, r9, asr #17
    mov r5, r5, asr #17
    mov r14, r14, asr #17
    mov r4, r4, asr #17
    mov r3, r3, asr #17
    mov r0, r0, asr #17
    mov r2, r2, asr #17

    b Row8_Sat
    ENDP

    END