/* mirror of https://github.com/F-Stack/f-stack.git */
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
#ifdef PIC
.text
.align 64
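/*
 * poly1305_init(void *ctx, const unsigned char key[16], void *func[2]);
 * argument roles are inferred from the 20/24/28(%esp) loads below.  It
 * zeroes the five-word accumulator plus the base-2^26 flag at ctx+20,
 * clamps r with the RFC 7539 masks 0x0fffffff/0x0ffffffc (268435455 and
 * 268435452 below), and publishes block/emit entry points: the SSE2
 * pair when OPENSSL_ia32cap_P shows FXSR+SSE2 (83886080 = 0x05000000),
 * and the AVX2 blocks routine when word 2 bit 5 is also set.
 */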
.globl poly1305_init
.type poly1305_init,@function
.align 16
poly1305_init:
.L_poly1305_init_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ebp
	xorl %eax,%eax
	movl %eax,(%edi)
	movl %eax,4(%edi)
	movl %eax,8(%edi)
	movl %eax,12(%edi)
	movl %eax,16(%edi)
	movl %eax,20(%edi)
	cmpl $0,%esi
	je .L000nokey
	call .L001pic_point
.L001pic_point:
	popl %ebx
	leal poly1305_blocks-.L001pic_point(%ebx),%eax
	leal poly1305_emit-.L001pic_point(%ebx),%edx
	leal OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	movl (%edi),%ecx
	andl $83886080,%ecx
	cmpl $83886080,%ecx
	jne .L002no_sse2
	leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	movl 8(%edi),%ecx
	testl $32,%ecx
	jz .L002no_sse2
	leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl 20(%esp),%edi
	movl %eax,(%ebp)
	movl %edx,4(%ebp)
	movl (%esi),%eax
	movl 4(%esi),%ebx
	movl 8(%esi),%ecx
	movl 12(%esi),%edx
	andl $268435455,%eax
	andl $268435452,%ebx
	andl $268435452,%ecx
	andl $268435452,%edx
	movl %eax,24(%edi)
	movl %ebx,28(%edi)
	movl %ecx,32(%edi)
	movl %edx,36(%edi)
	movl $1,%eax
.L000nokey:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size poly1305_init,.-.L_poly1305_init_begin
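/*
 * poly1305_blocks(ctx, inp, len, padbit): integer-only fallback.  The
 * accumulator lives in five 32-bit words; each 16-byte block is added
 * with carry (the pad bit arrives via 96(%esp) after the subl $64,%esp),
 * multiplied by r with mull/imull, and reduced mod 2^130-5 by the
 * shrl $2 / leal (%edx,%edx,4) sequence, which folds carry*5 back in
 * since 2^130 = 5 (mod p).  52..60(%esp) hold r_i + r_i/4, exact
 * because clamping zeroes the low two bits of r1..r3.
 */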
.globl poly1305_blocks
.type poly1305_blocks,@function
.align 16
poly1305_blocks:
.L_poly1305_blocks_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
.Lenter_blocks:
	andl $-15,%ecx
	jz .L003nodata
	subl $64,%esp
	movl 24(%edi),%eax
	movl 28(%edi),%ebx
	leal (%esi,%ecx,1),%ebp
	movl 32(%edi),%ecx
	movl 36(%edi),%edx
	movl %ebp,92(%esp)
	movl %esi,%ebp
	movl %eax,36(%esp)
	movl %ebx,%eax
	shrl $2,%eax
	movl %ebx,40(%esp)
	addl %ebx,%eax
	movl %ecx,%ebx
	shrl $2,%ebx
	movl %ecx,44(%esp)
	addl %ecx,%ebx
	movl %edx,%ecx
	shrl $2,%ecx
	movl %edx,48(%esp)
	addl %edx,%ecx
	movl %eax,52(%esp)
	movl %ebx,56(%esp)
	movl %ecx,60(%esp)
	movl (%edi),%eax
	movl 4(%edi),%ebx
	movl 8(%edi),%ecx
	movl 12(%edi),%esi
	movl 16(%edi),%edi
	jmp .L004loop
.align 32
.L004loop:
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	adcl 8(%ebp),%ecx
	adcl 12(%ebp),%esi
	leal 16(%ebp),%ebp
	adcl 96(%esp),%edi
	movl %eax,(%esp)
	movl %esi,12(%esp)
	mull 36(%esp)
	movl %edi,16(%esp)
	movl %eax,%edi
	movl %ebx,%eax
	movl %edx,%esi
	mull 60(%esp)
	addl %eax,%edi
	movl %ecx,%eax
	adcl %edx,%esi
	mull 56(%esp)
	addl %eax,%edi
	movl 12(%esp),%eax
	adcl %edx,%esi
	mull 52(%esp)
	addl %eax,%edi
	movl (%esp),%eax
	adcl %edx,%esi
	mull 40(%esp)
	movl %edi,20(%esp)
	xorl %edi,%edi
	addl %eax,%esi
	movl %ebx,%eax
	adcl %edx,%edi
	mull 36(%esp)
	addl %eax,%esi
	movl %ecx,%eax
	adcl %edx,%edi
	mull 60(%esp)
	addl %eax,%esi
	movl 12(%esp),%eax
	adcl %edx,%edi
	mull 56(%esp)
	addl %eax,%esi
	movl 16(%esp),%eax
	adcl %edx,%edi
	imull 52(%esp),%eax
	addl %eax,%esi
	movl (%esp),%eax
	adcl $0,%edi
	mull 44(%esp)
	movl %esi,24(%esp)
	xorl %esi,%esi
	addl %eax,%edi
	movl %ebx,%eax
	adcl %edx,%esi
	mull 40(%esp)
	addl %eax,%edi
	movl %ecx,%eax
	adcl %edx,%esi
	mull 36(%esp)
	addl %eax,%edi
	movl 12(%esp),%eax
	adcl %edx,%esi
	mull 60(%esp)
	addl %eax,%edi
	movl 16(%esp),%eax
	adcl %edx,%esi
	imull 56(%esp),%eax
	addl %eax,%edi
	movl (%esp),%eax
	adcl $0,%esi
	mull 48(%esp)
	movl %edi,28(%esp)
	xorl %edi,%edi
	addl %eax,%esi
	movl %ebx,%eax
	adcl %edx,%edi
	mull 44(%esp)
	addl %eax,%esi
	movl %ecx,%eax
	adcl %edx,%edi
	mull 40(%esp)
	addl %eax,%esi
	movl 12(%esp),%eax
	adcl %edx,%edi
	mull 36(%esp)
	addl %eax,%esi
	movl 16(%esp),%ecx
	adcl %edx,%edi
	movl %ecx,%edx
	imull 60(%esp),%ecx
	addl %ecx,%esi
	movl 20(%esp),%eax
	adcl $0,%edi
	imull 36(%esp),%edx
	addl %edi,%edx
	movl 24(%esp),%ebx
	movl 28(%esp),%ecx
	movl %edx,%edi
	shrl $2,%edx
	andl $3,%edi
	leal (%edx,%edx,4),%edx
	addl %edx,%eax
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%esi
	adcl $0,%edi
	cmpl 92(%esp),%ebp
	jne .L004loop
	movl 84(%esp),%edx
	addl $64,%esp
	movl %eax,(%edx)
	movl %ebx,4(%edx)
	movl %ecx,8(%edx)
	movl %esi,12(%edx)
	movl %edi,16(%edx)
.L003nodata:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size poly1305_blocks,.-.L_poly1305_blocks_begin
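/*
 * poly1305_emit(ctx, mac, nonce): final reduction and tag.  It adds 5
 * to h, takes the carry above bit 130 (shrl $2 of the top word), turns
 * it into an all-ones/all-zero mask with negl, and blends h+5 against
 * the original h without branching; the 128-bit nonce is then added
 * and the 16-byte tag stored through %edi.
 */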
.globl poly1305_emit
.type poly1305_emit,@function
.align 16
poly1305_emit:
.L_poly1305_emit_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%ebp
.Lenter_emit:
	movl 24(%esp),%edi
	movl (%ebp),%eax
	movl 4(%ebp),%ebx
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 16(%ebp),%esi
	addl $5,%eax
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%edx
	adcl $0,%esi
	shrl $2,%esi
	negl %esi
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	movl %eax,(%edi)
	movl %ebx,4(%edi)
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	notl %esi
	movl (%ebp),%eax
	movl 4(%ebp),%ebx
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 28(%esp),%ebp
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	orl (%edi),%eax
	orl 4(%edi),%ebx
	orl 8(%edi),%ecx
	orl 12(%edi),%edx
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	adcl 8(%ebp),%ecx
	adcl 12(%ebp),%edx
	movl %eax,(%edi)
	movl %ebx,4(%edi)
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size poly1305_emit,.-.L_poly1305_emit_begin
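/*
 * _poly1305_init_sse2: splits the clamped r (loaded from ctx+24) into
 * five 26-bit limbs using the 64(%ebx) mask from .Lconst_sse2, then
 * runs the .L005square loop twice (%ecx = 2) to build the table of r
 * powers kept at ctx+48 for the two-way SSE2 path, together with the
 * 5*limb values (pslld $2 + paddd) that make the lazy reduction cheap.
 */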
.align 32
.type _poly1305_init_sse2,@function
.align 16
_poly1305_init_sse2:
	movdqu 24(%edi),%xmm4
	leal 48(%edi),%edi
	movl %esp,%ebp
	subl $224,%esp
	andl $-16,%esp
	movq 64(%ebx),%xmm7
	movdqa %xmm4,%xmm0
	movdqa %xmm4,%xmm1
	movdqa %xmm4,%xmm2
	pand %xmm7,%xmm0
	psrlq $26,%xmm1
	psrldq $6,%xmm2
	pand %xmm7,%xmm1
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	psrldq $13,%xmm4
	leal 144(%esp),%edx
	movl $2,%ecx
.L005square:
	movdqa %xmm0,(%esp)
	movdqa %xmm1,16(%esp)
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm1,%xmm6
	movdqa %xmm2,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm1,%xmm6
	paddd %xmm2,%xmm5
	movdqa %xmm6,80(%esp)
	movdqa %xmm5,96(%esp)
	movdqa %xmm3,%xmm6
	movdqa %xmm4,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm3,%xmm6
	paddd %xmm4,%xmm5
	movdqa %xmm6,112(%esp)
	movdqa %xmm5,128(%esp)
	pshufd $68,%xmm0,%xmm6
	movdqa %xmm1,%xmm5
	pshufd $68,%xmm1,%xmm1
	pshufd $68,%xmm2,%xmm2
	pshufd $68,%xmm3,%xmm3
	pshufd $68,%xmm4,%xmm4
	movdqa %xmm6,(%edx)
	movdqa %xmm1,16(%edx)
	movdqa %xmm2,32(%edx)
	movdqa %xmm3,48(%edx)
	movdqa %xmm4,64(%edx)
	pmuludq %xmm0,%xmm4
	pmuludq %xmm0,%xmm3
	pmuludq %xmm0,%xmm2
	pmuludq %xmm0,%xmm1
	pmuludq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 48(%edx),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa 80(%esp),%xmm6
	pmuludq (%edx),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%edx),%xmm6
	movdqa 32(%esp),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%edx),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%edx),%xmm5
	paddq %xmm7,%xmm4
	movdqa 96(%esp),%xmm7
	pmuludq (%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%edx),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%edx),%xmm5
	movdqa 48(%esp),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%edx),%xmm6
	paddq %xmm5,%xmm0
	movdqa 112(%esp),%xmm5
	pmuludq (%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%edx),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%edx),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%edx),%xmm7
	movdqa 64(%esp),%xmm5
	paddq %xmm6,%xmm1
	movdqa 128(%esp),%xmm6
	pmuludq (%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%edx),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%edx),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
	decl %ecx
	jz .L006square_break
	punpcklqdq (%esp),%xmm0
	punpcklqdq 16(%esp),%xmm1
	punpcklqdq 32(%esp),%xmm2
	punpcklqdq 48(%esp),%xmm3
	punpcklqdq 64(%esp),%xmm4
	jmp .L005square
.L006square_break:
	psllq $32,%xmm0
	psllq $32,%xmm1
	psllq $32,%xmm2
	psllq $32,%xmm3
	psllq $32,%xmm4
	por (%esp),%xmm0
	por 16(%esp),%xmm1
	por 32(%esp),%xmm2
	por 48(%esp),%xmm3
	por 64(%esp),%xmm4
	pshufd $141,%xmm0,%xmm0
	pshufd $141,%xmm1,%xmm1
	pshufd $141,%xmm2,%xmm2
	pshufd $141,%xmm3,%xmm3
	pshufd $141,%xmm4,%xmm4
	movdqu %xmm0,(%edi)
	movdqu %xmm1,16(%edi)
	movdqu %xmm2,32(%edi)
	movdqu %xmm3,48(%edi)
	movdqu %xmm4,64(%edi)
	movdqa %xmm1,%xmm6
	movdqa %xmm2,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm1,%xmm6
	paddd %xmm2,%xmm5
	movdqu %xmm6,80(%edi)
	movdqu %xmm5,96(%edi)
	movdqa %xmm3,%xmm6
	movdqa %xmm4,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm3,%xmm6
	paddd %xmm4,%xmm5
	movdqu %xmm6,112(%edi)
	movdqu %xmm5,128(%edi)
	movl %ebp,%esp
	leal -48(%edi),%edi
	ret
.size _poly1305_init_sse2,.-_poly1305_init_sse2
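/*
 * _poly1305_blocks_sse2: two blocks per iteration in base 2^26.  If the
 * state is still base 2^32 (ctx+20 == 0) it is converted first via the
 * overlapping loads at 0/3/6/9/13(%edi) with 67108863 (2^26-1) masks.
 * 16777216 (1<<24) from .Lconst_sse2, or'ed in through por (%ebx),%xmm4,
 * is the 2^128 pad bit expressed in the top limb; the psrlq $26 / pand
 * chains carry lazily between limbs, and inputs shorter than 64 bytes
 * that are still base 2^32 fall back to .Lenter_blocks in the scalar
 * routine.
 */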
.align 32
.type _poly1305_blocks_sse2,@function
.align 16
_poly1305_blocks_sse2:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
	movl 20(%edi),%eax
	andl $-16,%ecx
	jz .L007nodata
	cmpl $64,%ecx
	jae .L008enter_sse2
	testl %eax,%eax
	jz .Lenter_blocks
.align 16
.L008enter_sse2:
	call .L009pic_point
.L009pic_point:
	popl %ebx
	leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl %eax,%eax
	jnz .L010base2_26
	call _poly1305_init_sse2
	movl (%edi),%eax
	movl 3(%edi),%ecx
	movl 6(%edi),%edx
	movl 9(%edi),%esi
	movl 13(%edi),%ebp
	movl $1,20(%edi)
	shrl $2,%ecx
	andl $67108863,%eax
	shrl $4,%edx
	andl $67108863,%ecx
	shrl $6,%esi
	andl $67108863,%edx
	movd %eax,%xmm0
	movd %ecx,%xmm1
	movd %edx,%xmm2
	movd %esi,%xmm3
	movd %ebp,%xmm4
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
	jmp .L011base2_32
.align 16
.L010base2_26:
	movd (%edi),%xmm0
	movd 4(%edi),%xmm1
	movd 8(%edi),%xmm2
	movd 12(%edi),%xmm3
	movd 16(%edi),%xmm4
	movdqa 64(%ebx),%xmm7
.L011base2_32:
	movl 32(%esp),%eax
	movl %esp,%ebp
	subl $528,%esp
	andl $-16,%esp
	leal 48(%edi),%edi
	shll $24,%eax
	testl $31,%ecx
	jz .L012even
	movdqu (%esi),%xmm6
	leal 16(%esi),%esi
	movdqa %xmm6,%xmm5
	pand %xmm7,%xmm6
	paddd %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	psrlq $26,%xmm5
	psrldq $6,%xmm6
	pand %xmm7,%xmm5
	paddd %xmm5,%xmm1
	movdqa %xmm6,%xmm5
	psrlq $4,%xmm6
	pand %xmm7,%xmm6
	paddd %xmm6,%xmm2
	movdqa %xmm5,%xmm6
	psrlq $30,%xmm5
	pand %xmm7,%xmm5
	psrldq $7,%xmm6
	paddd %xmm5,%xmm3
	movd %eax,%xmm5
	paddd %xmm6,%xmm4
	movd 12(%edi),%xmm6
	paddd %xmm5,%xmm4
	movdqa %xmm0,(%esp)
	movdqa %xmm1,16(%esp)
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	pmuludq %xmm6,%xmm0
	pmuludq %xmm6,%xmm1
	pmuludq %xmm6,%xmm2
	movd 28(%edi),%xmm5
	pmuludq %xmm6,%xmm3
	pmuludq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 48(%esp),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	movd 92(%edi),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%esp),%xmm6
	movd 44(%edi),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%esp),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%esp),%xmm5
	paddq %xmm7,%xmm4
	movd 108(%edi),%xmm7
	pmuludq (%esp),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%esp),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%esp),%xmm5
	movd 60(%edi),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%esp),%xmm6
	paddq %xmm5,%xmm0
	movd 124(%edi),%xmm5
	pmuludq (%esp),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%esp),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%esp),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%esp),%xmm7
	movd 76(%edi),%xmm5
	paddq %xmm6,%xmm1
	movd 140(%edi),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%esp),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%esp),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
	subl $16,%ecx
	jz .L013done
.L012even:
	leal 384(%esp),%edx
	leal -32(%esi),%eax
	subl $64,%ecx
	movdqu (%edi),%xmm5
	pshufd $68,%xmm5,%xmm6
	cmovbl %eax,%esi
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,(%edx)
	leal 160(%esp),%eax
	movdqu 16(%edi),%xmm6
	movdqa %xmm5,-144(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,16(%edx)
	movdqu 32(%edi),%xmm5
	movdqa %xmm6,-128(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,32(%edx)
	movdqu 48(%edi),%xmm6
	movdqa %xmm5,-112(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,48(%edx)
	movdqu 64(%edi),%xmm5
	movdqa %xmm6,-96(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,64(%edx)
	movdqu 80(%edi),%xmm6
	movdqa %xmm5,-80(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,80(%edx)
	movdqu 96(%edi),%xmm5
	movdqa %xmm6,-64(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,96(%edx)
	movdqu 112(%edi),%xmm6
	movdqa %xmm5,-48(%edx)
	pshufd $68,%xmm6,%xmm5
	pshufd $238,%xmm6,%xmm6
	movdqa %xmm5,112(%edx)
	movdqu 128(%edi),%xmm5
	movdqa %xmm6,-32(%edx)
	pshufd $68,%xmm5,%xmm6
	pshufd $238,%xmm5,%xmm5
	movdqa %xmm6,128(%edx)
	movdqa %xmm5,-16(%edx)
	movdqu 32(%esi),%xmm5
	movdqu 48(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,112(%esp)
	movdqa %xmm3,128(%esp)
	movdqa %xmm4,144(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	movdqa %xmm0,80(%esp)
	movdqa %xmm1,96(%esp)
	jbe .L014skip_loop
	jmp .L015loop
.align 32
.L015loop:
	movdqa -144(%edx),%xmm7
	movdqa %xmm6,16(%eax)
	movdqa %xmm2,32(%eax)
	movdqa %xmm3,48(%eax)
	movdqa %xmm4,64(%eax)
	movdqa %xmm5,%xmm1
	pmuludq %xmm7,%xmm5
	movdqa %xmm6,%xmm0
	pmuludq %xmm7,%xmm6
	pmuludq %xmm7,%xmm2
	pmuludq %xmm7,%xmm3
	pmuludq %xmm7,%xmm4
	pmuludq -16(%edx),%xmm0
	movdqa %xmm1,%xmm7
	pmuludq -128(%edx),%xmm1
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq -112(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa %xmm5,%xmm6
	pmuludq -96(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 16(%eax),%xmm7
	pmuludq -80(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq -128(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq -112(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 32(%eax),%xmm7
	pmuludq -96(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq -32(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq -16(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq -128(%edx),%xmm6
	paddq %xmm5,%xmm1
	movdqa 48(%eax),%xmm5
	pmuludq -112(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq -48(%edx),%xmm5
	paddq %xmm7,%xmm4
	movdqa %xmm6,%xmm7
	pmuludq -32(%edx),%xmm6
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq -16(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa 64(%eax),%xmm6
	pmuludq -128(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa %xmm6,%xmm7
	pmuludq -16(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq -64(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq -48(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa 64(%ebx),%xmm7
	pmuludq -32(%edx),%xmm6
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqu -32(%esi),%xmm5
	movdqu -16(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	leal -32(%esi),%eax
	subl $64,%ecx
	paddd 80(%esp),%xmm5
	paddd 96(%esp),%xmm6
	paddd 112(%esp),%xmm2
	paddd 128(%esp),%xmm3
	paddd 144(%esp),%xmm4
	cmovbl %eax,%esi
	leal 160(%esp),%eax
	movdqa (%edx),%xmm7
	movdqa %xmm1,16(%esp)
	movdqa %xmm6,16(%eax)
	movdqa %xmm2,32(%eax)
	movdqa %xmm3,48(%eax)
	movdqa %xmm4,64(%eax)
	movdqa %xmm5,%xmm1
	pmuludq %xmm7,%xmm5
	paddq %xmm0,%xmm5
	movdqa %xmm6,%xmm0
	pmuludq %xmm7,%xmm6
	pmuludq %xmm7,%xmm2
	pmuludq %xmm7,%xmm3
	pmuludq %xmm7,%xmm4
	paddq 16(%esp),%xmm6
	paddq 32(%esp),%xmm2
	paddq 48(%esp),%xmm3
	paddq 64(%esp),%xmm4
	pmuludq 128(%edx),%xmm0
	movdqa %xmm1,%xmm7
	pmuludq 16(%edx),%xmm1
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq 32(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa %xmm5,%xmm6
	pmuludq 48(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 16(%eax),%xmm7
	pmuludq 64(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 16(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 32(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa 32(%eax),%xmm7
	pmuludq 48(%edx),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 112(%edx),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 128(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 16(%edx),%xmm6
	paddq %xmm5,%xmm1
	movdqa 48(%eax),%xmm5
	pmuludq 32(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 96(%edx),%xmm5
	paddq %xmm7,%xmm4
	movdqa %xmm6,%xmm7
	pmuludq 112(%edx),%xmm6
	paddq %xmm5,%xmm0
	movdqa %xmm7,%xmm5
	pmuludq 128(%edx),%xmm7
	paddq %xmm6,%xmm1
	movdqa 64(%eax),%xmm6
	pmuludq 16(%edx),%xmm5
	paddq %xmm7,%xmm2
	movdqa %xmm6,%xmm7
	pmuludq 128(%edx),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 80(%edx),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 96(%edx),%xmm5
	paddq %xmm7,%xmm0
	movdqa 64(%ebx),%xmm7
	pmuludq 112(%edx),%xmm6
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
	movdqu 32(%esi),%xmm5
	movdqu 48(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,112(%esp)
	movdqa %xmm3,128(%esp)
	movdqa %xmm4,144(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	movdqa %xmm0,80(%esp)
	movdqa %xmm1,96(%esp)
	ja .L015loop
.L014skip_loop:
	pshufd $16,-144(%edx),%xmm7
	addl $32,%ecx
	jnz .L016long_tail
	paddd %xmm0,%xmm5
	paddd %xmm1,%xmm6
	paddd 112(%esp),%xmm2
	paddd 128(%esp),%xmm3
	paddd 144(%esp),%xmm4
.L016long_tail:
	movdqa %xmm5,(%eax)
	movdqa %xmm6,16(%eax)
	movdqa %xmm2,32(%eax)
	movdqa %xmm3,48(%eax)
	movdqa %xmm4,64(%eax)
	pmuludq %xmm7,%xmm5
	pmuludq %xmm7,%xmm6
	pmuludq %xmm7,%xmm2
	movdqa %xmm5,%xmm0
	pshufd $16,-128(%edx),%xmm5
	pmuludq %xmm7,%xmm3
	movdqa %xmm6,%xmm1
	pmuludq %xmm7,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 48(%eax),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%eax),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%eax),%xmm7
	paddq %xmm6,%xmm3
	pshufd $16,-64(%edx),%xmm6
	pmuludq (%eax),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%eax),%xmm6
	pshufd $16,-112(%edx),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%eax),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%eax),%xmm5
	paddq %xmm7,%xmm4
	pshufd $16,-48(%edx),%xmm7
	pmuludq (%eax),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%eax),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%eax),%xmm5
	pshufd $16,-96(%edx),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%eax),%xmm6
	paddq %xmm5,%xmm0
	pshufd $16,-32(%edx),%xmm5
	pmuludq (%eax),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%eax),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%eax),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%eax),%xmm7
	pshufd $16,-80(%edx),%xmm5
	paddq %xmm6,%xmm1
	pshufd $16,-16(%edx),%xmm6
	pmuludq (%eax),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%eax),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%eax),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%eax),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%eax),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
	jz .L017short_tail
	movdqu -32(%esi),%xmm5
	movdqu -16(%esi),%xmm6
	leal 32(%esi),%esi
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm5,%xmm2
	movdqa %xmm6,%xmm3
	psrldq $6,%xmm2
	psrldq $6,%xmm3
	movdqa %xmm5,%xmm4
	punpcklqdq %xmm3,%xmm2
	punpckhqdq %xmm6,%xmm4
	punpcklqdq %xmm6,%xmm5
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	movdqa %xmm5,%xmm6
	psrlq $40,%xmm4
	psrlq $26,%xmm6
	pand %xmm7,%xmm5
	pand %xmm7,%xmm6
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	por (%ebx),%xmm4
	pshufd $16,(%edx),%xmm7
	paddd 80(%esp),%xmm5
	paddd 96(%esp),%xmm6
	paddd 112(%esp),%xmm2
	paddd 128(%esp),%xmm3
	paddd 144(%esp),%xmm4
	movdqa %xmm5,(%esp)
	pmuludq %xmm7,%xmm5
	movdqa %xmm6,16(%esp)
	pmuludq %xmm7,%xmm6
	paddq %xmm5,%xmm0
	movdqa %xmm2,%xmm5
	pmuludq %xmm7,%xmm2
	paddq %xmm6,%xmm1
	movdqa %xmm3,%xmm6
	pmuludq %xmm7,%xmm3
	paddq 32(%esp),%xmm2
	movdqa %xmm5,32(%esp)
	pshufd $16,16(%edx),%xmm5
	paddq 48(%esp),%xmm3
	movdqa %xmm6,48(%esp)
	movdqa %xmm4,%xmm6
	pmuludq %xmm7,%xmm4
	paddq 64(%esp),%xmm4
	movdqa %xmm6,64(%esp)
	movdqa %xmm5,%xmm6
	pmuludq 48(%esp),%xmm5
	movdqa %xmm6,%xmm7
	pmuludq 32(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	pshufd $16,80(%edx),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm2
	pmuludq 64(%esp),%xmm6
	pshufd $16,32(%edx),%xmm7
	paddq %xmm5,%xmm1
	movdqa %xmm7,%xmm5
	pmuludq 32(%esp),%xmm7
	paddq %xmm6,%xmm0
	movdqa %xmm5,%xmm6
	pmuludq 16(%esp),%xmm5
	paddq %xmm7,%xmm4
	pshufd $16,96(%edx),%xmm7
	pmuludq (%esp),%xmm6
	paddq %xmm5,%xmm3
	movdqa %xmm7,%xmm5
	pmuludq 64(%esp),%xmm7
	paddq %xmm6,%xmm2
	pmuludq 48(%esp),%xmm5
	pshufd $16,48(%edx),%xmm6
	paddq %xmm7,%xmm1
	movdqa %xmm6,%xmm7
	pmuludq 16(%esp),%xmm6
	paddq %xmm5,%xmm0
	pshufd $16,112(%edx),%xmm5
	pmuludq (%esp),%xmm7
	paddq %xmm6,%xmm4
	movdqa %xmm5,%xmm6
	pmuludq 64(%esp),%xmm5
	paddq %xmm7,%xmm3
	movdqa %xmm6,%xmm7
	pmuludq 48(%esp),%xmm6
	paddq %xmm5,%xmm2
	pmuludq 32(%esp),%xmm7
	pshufd $16,64(%edx),%xmm5
	paddq %xmm6,%xmm1
	pshufd $16,128(%edx),%xmm6
	pmuludq (%esp),%xmm5
	paddq %xmm7,%xmm0
	movdqa %xmm6,%xmm7
	pmuludq 64(%esp),%xmm6
	paddq %xmm5,%xmm4
	movdqa %xmm7,%xmm5
	pmuludq 16(%esp),%xmm7
	paddq %xmm6,%xmm3
	movdqa %xmm5,%xmm6
	pmuludq 32(%esp),%xmm5
	paddq %xmm7,%xmm0
	pmuludq 48(%esp),%xmm6
	movdqa 64(%ebx),%xmm7
	paddq %xmm5,%xmm1
	paddq %xmm6,%xmm2
.L017short_tail:
	pshufd $78,%xmm4,%xmm6
	pshufd $78,%xmm3,%xmm5
	paddq %xmm6,%xmm4
	paddq %xmm5,%xmm3
	pshufd $78,%xmm0,%xmm6
	pshufd $78,%xmm1,%xmm5
	paddq %xmm6,%xmm0
	paddq %xmm5,%xmm1
	pshufd $78,%xmm2,%xmm6
	movdqa %xmm3,%xmm5
	pand %xmm7,%xmm3
	psrlq $26,%xmm5
	paddq %xmm6,%xmm2
	paddq %xmm4,%xmm5
	movdqa %xmm0,%xmm6
	pand %xmm7,%xmm0
	psrlq $26,%xmm6
	movdqa %xmm5,%xmm4
	paddq %xmm1,%xmm6
	psrlq $26,%xmm5
	pand %xmm7,%xmm4
	movdqa %xmm6,%xmm1
	psrlq $26,%xmm6
	paddd %xmm5,%xmm0
	psllq $2,%xmm5
	paddq %xmm2,%xmm6
	paddq %xmm0,%xmm5
	pand %xmm7,%xmm1
	movdqa %xmm6,%xmm2
	psrlq $26,%xmm6
	pand %xmm7,%xmm2
	paddd %xmm3,%xmm6
	movdqa %xmm5,%xmm0
	psrlq $26,%xmm5
	movdqa %xmm6,%xmm3
	psrlq $26,%xmm6
	pand %xmm7,%xmm0
	paddd %xmm5,%xmm1
	pand %xmm7,%xmm3
	paddd %xmm6,%xmm4
.L013done:
	movd %xmm0,-48(%edi)
	movd %xmm1,-44(%edi)
	movd %xmm2,-40(%edi)
	movd %xmm3,-36(%edi)
	movd %xmm4,-32(%edi)
	movl %ebp,%esp
.L007nodata:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
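/*
 * _poly1305_emit_sse2: when ctx+20 is still zero the state is base
 * 2^32 and the plain .Lenter_emit path is taken.  Otherwise the five
 * 26-bit limbs are stitched back into four 32-bit words (the paired
 * shll/shrl sequences), the leftover top bits are folded in as *5 via
 * leal (%edi,%edi,4), and the same branchless h vs h+5 select as in
 * poly1305_emit produces the tag, with movd spills preserving h
 * across the select.
 */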
.align 32
.type _poly1305_emit_sse2,@function
.align 16
_poly1305_emit_sse2:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%ebp
	cmpl $0,20(%ebp)
	je .Lenter_emit
	movl (%ebp),%eax
	movl 4(%ebp),%edi
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 16(%ebp),%esi
	movl %edi,%ebx
	shll $26,%edi
	shrl $6,%ebx
	addl %edi,%eax
	movl %ecx,%edi
	adcl $0,%ebx
	shll $20,%edi
	shrl $12,%ecx
	addl %edi,%ebx
	movl %edx,%edi
	adcl $0,%ecx
	shll $14,%edi
	shrl $18,%edx
	addl %edi,%ecx
	movl %esi,%edi
	adcl $0,%edx
	shll $8,%edi
	shrl $24,%esi
	addl %edi,%edx
	adcl $0,%esi
	movl %esi,%edi
	andl $3,%esi
	shrl $2,%edi
	leal (%edi,%edi,4),%ebp
	movl 24(%esp),%edi
	addl %ebp,%eax
	movl 28(%esp),%ebp
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%edx
	adcl $0,%esi
	movd %eax,%xmm0
	addl $5,%eax
	movd %ebx,%xmm1
	adcl $0,%ebx
	movd %ecx,%xmm2
	adcl $0,%ecx
	movd %edx,%xmm3
	adcl $0,%edx
	adcl $0,%esi
	shrl $2,%esi
	negl %esi
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	movl %eax,(%edi)
	movd %xmm0,%eax
	movl %ebx,4(%edi)
	movd %xmm1,%ebx
	movl %ecx,8(%edi)
	movd %xmm2,%ecx
	movl %edx,12(%edi)
	movd %xmm3,%edx
	notl %esi
	andl %esi,%eax
	andl %esi,%ebx
	orl (%edi),%eax
	andl %esi,%ecx
	orl 4(%edi),%ebx
	andl %esi,%edx
	orl 8(%edi),%ecx
	orl 12(%edi),%edx
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	movl %eax,(%edi)
	adcl 8(%ebp),%ecx
	movl %ebx,4(%edi)
	adcl 12(%ebp),%edx
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
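/*
 * _poly1305_init_avx2: same limb split, double squaring and 5*limb
 * tables as _poly1305_init_sse2, rewritten with non-destructive VEX
 * encodings (vpand, vpmuludq, ...) on 128-bit registers.
 */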
.align 32
.type _poly1305_init_avx2,@function
.align 16
_poly1305_init_avx2:
	vmovdqu 24(%edi),%xmm4
	leal 48(%edi),%edi
	movl %esp,%ebp
	subl $224,%esp
	andl $-16,%esp
	vmovdqa 64(%ebx),%xmm7
	vpand %xmm7,%xmm4,%xmm0
	vpsrlq $26,%xmm4,%xmm1
	vpsrldq $6,%xmm4,%xmm3
	vpand %xmm7,%xmm1,%xmm1
	vpsrlq $4,%xmm3,%xmm2
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm7,%xmm2,%xmm2
	vpand %xmm7,%xmm3,%xmm3
	vpsrldq $13,%xmm4,%xmm4
	leal 144(%esp),%edx
	movl $2,%ecx
.L018square:
	vmovdqa %xmm0,(%esp)
	vmovdqa %xmm1,16(%esp)
	vmovdqa %xmm2,32(%esp)
	vmovdqa %xmm3,48(%esp)
	vmovdqa %xmm4,64(%esp)
	vpslld $2,%xmm1,%xmm6
	vpslld $2,%xmm2,%xmm5
	vpaddd %xmm1,%xmm6,%xmm6
	vpaddd %xmm2,%xmm5,%xmm5
	vmovdqa %xmm6,80(%esp)
	vmovdqa %xmm5,96(%esp)
	vpslld $2,%xmm3,%xmm6
	vpslld $2,%xmm4,%xmm5
	vpaddd %xmm3,%xmm6,%xmm6
	vpaddd %xmm4,%xmm5,%xmm5
	vmovdqa %xmm6,112(%esp)
	vmovdqa %xmm5,128(%esp)
	vpshufd $68,%xmm0,%xmm5
	vmovdqa %xmm1,%xmm6
	vpshufd $68,%xmm1,%xmm1
	vpshufd $68,%xmm2,%xmm2
	vpshufd $68,%xmm3,%xmm3
	vpshufd $68,%xmm4,%xmm4
	vmovdqa %xmm5,(%edx)
	vmovdqa %xmm1,16(%edx)
	vmovdqa %xmm2,32(%edx)
	vmovdqa %xmm3,48(%edx)
	vmovdqa %xmm4,64(%edx)
	vpmuludq %xmm0,%xmm4,%xmm4
	vpmuludq %xmm0,%xmm3,%xmm3
	vpmuludq %xmm0,%xmm2,%xmm2
	vpmuludq %xmm0,%xmm1,%xmm1
	vpmuludq %xmm0,%xmm5,%xmm0
	vpmuludq 48(%edx),%xmm6,%xmm5
	vpaddq %xmm5,%xmm4,%xmm4
	vpmuludq 32(%edx),%xmm6,%xmm7
	vpaddq %xmm7,%xmm3,%xmm3
	vpmuludq 16(%edx),%xmm6,%xmm5
	vpaddq %xmm5,%xmm2,%xmm2
	vmovdqa 80(%esp),%xmm7
	vpmuludq (%edx),%xmm6,%xmm6
	vpaddq %xmm6,%xmm1,%xmm1
	vmovdqa 32(%esp),%xmm5
	vpmuludq 64(%edx),%xmm7,%xmm7
	vpaddq %xmm7,%xmm0,%xmm0
	vpmuludq 32(%edx),%xmm5,%xmm6
	vpaddq %xmm6,%xmm4,%xmm4
	vpmuludq 16(%edx),%xmm5,%xmm7
	vpaddq %xmm7,%xmm3,%xmm3
	vmovdqa 96(%esp),%xmm6
	vpmuludq (%edx),%xmm5,%xmm5
	vpaddq %xmm5,%xmm2,%xmm2
	vpmuludq 64(%edx),%xmm6,%xmm7
	vpaddq %xmm7,%xmm1,%xmm1
	vmovdqa 48(%esp),%xmm5
	vpmuludq 48(%edx),%xmm6,%xmm6
	vpaddq %xmm6,%xmm0,%xmm0
	vpmuludq 16(%edx),%xmm5,%xmm7
	vpaddq %xmm7,%xmm4,%xmm4
	vmovdqa 112(%esp),%xmm6
	vpmuludq (%edx),%xmm5,%xmm5
	vpaddq %xmm5,%xmm3,%xmm3
	vpmuludq 64(%edx),%xmm6,%xmm7
	vpaddq %xmm7,%xmm2,%xmm2
	vpmuludq 48(%edx),%xmm6,%xmm5
	vpaddq %xmm5,%xmm1,%xmm1
	vmovdqa 64(%esp),%xmm7
	vpmuludq 32(%edx),%xmm6,%xmm6
	vpaddq %xmm6,%xmm0,%xmm0
	vmovdqa 128(%esp),%xmm5
	vpmuludq (%edx),%xmm7,%xmm7
	vpaddq %xmm7,%xmm4,%xmm4
	vpmuludq 64(%edx),%xmm5,%xmm6
	vpaddq %xmm6,%xmm3,%xmm3
	vpmuludq 16(%edx),%xmm5,%xmm7
	vpaddq %xmm7,%xmm0,%xmm0
	vpmuludq 32(%edx),%xmm5,%xmm6
	vpaddq %xmm6,%xmm1,%xmm1
	vmovdqa 64(%ebx),%xmm7
	vpmuludq 48(%edx),%xmm5,%xmm5
	vpaddq %xmm5,%xmm2,%xmm2
	vpsrlq $26,%xmm3,%xmm5
	vpand %xmm7,%xmm3,%xmm3
	vpsrlq $26,%xmm0,%xmm6
	vpand %xmm7,%xmm0,%xmm0
	vpaddq %xmm5,%xmm4,%xmm4
	vpaddq %xmm6,%xmm1,%xmm1
	vpsrlq $26,%xmm4,%xmm5
	vpand %xmm7,%xmm4,%xmm4
	vpsrlq $26,%xmm1,%xmm6
	vpand %xmm7,%xmm1,%xmm1
	vpaddq %xmm6,%xmm2,%xmm2
	vpaddd %xmm5,%xmm0,%xmm0
	vpsllq $2,%xmm5,%xmm5
	vpsrlq $26,%xmm2,%xmm6
	vpand %xmm7,%xmm2,%xmm2
	vpaddd %xmm5,%xmm0,%xmm0
	vpaddd %xmm6,%xmm3,%xmm3
	vpsrlq $26,%xmm3,%xmm6
	vpsrlq $26,%xmm0,%xmm5
	vpand %xmm7,%xmm0,%xmm0
	vpand %xmm7,%xmm3,%xmm3
	vpaddd %xmm5,%xmm1,%xmm1
	vpaddd %xmm6,%xmm4,%xmm4
	decl %ecx
	jz .L019square_break
	vpunpcklqdq (%esp),%xmm0,%xmm0
	vpunpcklqdq 16(%esp),%xmm1,%xmm1
	vpunpcklqdq 32(%esp),%xmm2,%xmm2
	vpunpcklqdq 48(%esp),%xmm3,%xmm3
	vpunpcklqdq 64(%esp),%xmm4,%xmm4
	jmp .L018square
.L019square_break:
	vpsllq $32,%xmm0,%xmm0
	vpsllq $32,%xmm1,%xmm1
	vpsllq $32,%xmm2,%xmm2
	vpsllq $32,%xmm3,%xmm3
	vpsllq $32,%xmm4,%xmm4
	vpor (%esp),%xmm0,%xmm0
	vpor 16(%esp),%xmm1,%xmm1
	vpor 32(%esp),%xmm2,%xmm2
	vpor 48(%esp),%xmm3,%xmm3
	vpor 64(%esp),%xmm4,%xmm4
	vpshufd $141,%xmm0,%xmm0
	vpshufd $141,%xmm1,%xmm1
	vpshufd $141,%xmm2,%xmm2
	vpshufd $141,%xmm3,%xmm3
	vpshufd $141,%xmm4,%xmm4
	vmovdqu %xmm0,(%edi)
	vmovdqu %xmm1,16(%edi)
	vmovdqu %xmm2,32(%edi)
	vmovdqu %xmm3,48(%edi)
	vmovdqu %xmm4,64(%edi)
	vpslld $2,%xmm1,%xmm6
	vpslld $2,%xmm2,%xmm5
	vpaddd %xmm1,%xmm6,%xmm6
	vpaddd %xmm2,%xmm5,%xmm5
	vmovdqu %xmm6,80(%edi)
	vmovdqu %xmm5,96(%edi)
	vpslld $2,%xmm3,%xmm6
	vpslld $2,%xmm4,%xmm5
	vpaddd %xmm3,%xmm6,%xmm6
	vpaddd %xmm4,%xmm5,%xmm5
	vmovdqu %xmm6,112(%edi)
	vmovdqu %xmm5,128(%edi)
	movl %ebp,%esp
	leal -48(%edi),%edi
	ret
.size _poly1305_init_avx2,.-_poly1305_init_avx2
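/*
 * _poly1305_blocks_avx2: four blocks per iteration on 256-bit ymm
 * registers.  The setup widens the key-power table with vpermq/vpshufd
 * into 288(%esp); .L025one/.L026two and the .L027tail code handle 1..3
 * trailing blocks (adjusting %ebx so the pad-bit vector lines up), and
 * the vpsrldq $8 / vpermq $2 ladder near the end adds the four lanes
 * horizontally before the final carry pass.
 */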
.align 32
.type _poly1305_blocks_avx2,@function
.align 16
_poly1305_blocks_avx2:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
	movl 20(%edi),%eax
	andl $-16,%ecx
	jz .L020nodata
	cmpl $64,%ecx
	jae .L021enter_avx2
	testl %eax,%eax
	jz .Lenter_blocks
.L021enter_avx2:
	vzeroupper
	call .L022pic_point
.L022pic_point:
	popl %ebx
	leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
	testl %eax,%eax
	jnz .L023base2_26
	call _poly1305_init_avx2
	movl (%edi),%eax
	movl 3(%edi),%ecx
	movl 6(%edi),%edx
	movl 9(%edi),%esi
	movl 13(%edi),%ebp
	shrl $2,%ecx
	andl $67108863,%eax
	shrl $4,%edx
	andl $67108863,%ecx
	shrl $6,%esi
	andl $67108863,%edx
	movl %eax,(%edi)
	movl %ecx,4(%edi)
	movl %edx,8(%edi)
	movl %esi,12(%edi)
	movl %ebp,16(%edi)
	movl $1,20(%edi)
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
.L023base2_26:
	movl 32(%esp),%eax
	movl %esp,%ebp
	subl $448,%esp
	andl $-512,%esp
	vmovdqu 48(%edi),%xmm0
	leal 288(%esp),%edx
	vmovdqu 64(%edi),%xmm1
	vmovdqu 80(%edi),%xmm2
	vmovdqu 96(%edi),%xmm3
	vmovdqu 112(%edi),%xmm4
	leal 48(%edi),%edi
	vpermq $64,%ymm0,%ymm0
	vpermq $64,%ymm1,%ymm1
	vpermq $64,%ymm2,%ymm2
	vpermq $64,%ymm3,%ymm3
	vpermq $64,%ymm4,%ymm4
	vpshufd $200,%ymm0,%ymm0
	vpshufd $200,%ymm1,%ymm1
	vpshufd $200,%ymm2,%ymm2
	vpshufd $200,%ymm3,%ymm3
	vpshufd $200,%ymm4,%ymm4
	vmovdqa %ymm0,-128(%edx)
	vmovdqu 80(%edi),%xmm0
	vmovdqa %ymm1,-96(%edx)
	vmovdqu 96(%edi),%xmm1
	vmovdqa %ymm2,-64(%edx)
	vmovdqu 112(%edi),%xmm2
	vmovdqa %ymm3,-32(%edx)
	vmovdqu 128(%edi),%xmm3
	vmovdqa %ymm4,(%edx)
	vpermq $64,%ymm0,%ymm0
	vpermq $64,%ymm1,%ymm1
	vpermq $64,%ymm2,%ymm2
	vpermq $64,%ymm3,%ymm3
	vpshufd $200,%ymm0,%ymm0
	vpshufd $200,%ymm1,%ymm1
	vpshufd $200,%ymm2,%ymm2
	vpshufd $200,%ymm3,%ymm3
	vmovdqa %ymm0,32(%edx)
	vmovd -48(%edi),%xmm0
	vmovdqa %ymm1,64(%edx)
	vmovd -44(%edi),%xmm1
	vmovdqa %ymm2,96(%edx)
	vmovd -40(%edi),%xmm2
	vmovdqa %ymm3,128(%edx)
	vmovd -36(%edi),%xmm3
	vmovd -32(%edi),%xmm4
	vmovdqa 64(%ebx),%ymm7
	negl %eax
	testl $63,%ecx
	jz .L024even
	movl %ecx,%edx
	andl $-64,%ecx
	andl $63,%edx
	vmovdqu (%esi),%xmm5
	cmpl $32,%edx
	jb .L025one
	vmovdqu 16(%esi),%xmm6
	je .L026two
	vinserti128 $1,32(%esi),%ymm5,%ymm5
	leal 48(%esi),%esi
	leal 8(%ebx),%ebx
	leal 296(%esp),%edx
	jmp .L027tail
.L026two:
	leal 32(%esi),%esi
	leal 16(%ebx),%ebx
	leal 304(%esp),%edx
	jmp .L027tail
.L025one:
	leal 16(%esi),%esi
	vpxor %ymm6,%ymm6,%ymm6
	leal 32(%ebx,%eax,8),%ebx
	leal 312(%esp),%edx
	jmp .L027tail
.align 32
.L024even:
	vmovdqu (%esi),%xmm5
	vmovdqu 16(%esi),%xmm6
	vinserti128 $1,32(%esi),%ymm5,%ymm5
	vinserti128 $1,48(%esi),%ymm6,%ymm6
	leal 64(%esi),%esi
	subl $64,%ecx
	jz .L027tail
.L028loop:
	vmovdqa %ymm2,64(%esp)
	vpsrldq $6,%ymm5,%ymm2
	vmovdqa %ymm0,(%esp)
	vpsrldq $6,%ymm6,%ymm0
	vmovdqa %ymm1,32(%esp)
	vpunpckhqdq %ymm6,%ymm5,%ymm1
	vpunpcklqdq %ymm6,%ymm5,%ymm5
	vpunpcklqdq %ymm0,%ymm2,%ymm2
	vpsrlq $30,%ymm2,%ymm0
	vpsrlq $4,%ymm2,%ymm2
	vpsrlq $26,%ymm5,%ymm6
	vpsrlq $40,%ymm1,%ymm1
	vpand %ymm7,%ymm2,%ymm2
	vpand %ymm7,%ymm5,%ymm5
	vpand %ymm7,%ymm6,%ymm6
	vpand %ymm7,%ymm0,%ymm0
	vpor (%ebx),%ymm1,%ymm1
	vpaddq 64(%esp),%ymm2,%ymm2
	vpaddq (%esp),%ymm5,%ymm5
	vpaddq 32(%esp),%ymm6,%ymm6
	vpaddq %ymm3,%ymm0,%ymm0
	vpaddq %ymm4,%ymm1,%ymm1
	vpmuludq -96(%edx),%ymm2,%ymm3
	vmovdqa %ymm6,32(%esp)
	vpmuludq -64(%edx),%ymm2,%ymm4
	vmovdqa %ymm0,96(%esp)
	vpmuludq 96(%edx),%ymm2,%ymm0
	vmovdqa %ymm1,128(%esp)
	vpmuludq 128(%edx),%ymm2,%ymm1
	vpmuludq -128(%edx),%ymm2,%ymm2
	vpmuludq -32(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm3,%ymm3
	vpmuludq (%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm4,%ymm4
	vpmuludq -128(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm0,%ymm0
	vmovdqa 32(%esp),%ymm7
	vpmuludq -96(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm1,%ymm1
	vpmuludq -64(%edx),%ymm5,%ymm5
	vpaddq %ymm5,%ymm2,%ymm2
	vpmuludq -64(%edx),%ymm7,%ymm6
	vpaddq %ymm6,%ymm3,%ymm3
	vpmuludq -32(%edx),%ymm7,%ymm5
	vpaddq %ymm5,%ymm4,%ymm4
	vpmuludq 128(%edx),%ymm7,%ymm6
	vpaddq %ymm6,%ymm0,%ymm0
	vmovdqa 96(%esp),%ymm6
	vpmuludq -128(%edx),%ymm7,%ymm5
	vpaddq %ymm5,%ymm1,%ymm1
	vpmuludq -96(%edx),%ymm7,%ymm7
	vpaddq %ymm7,%ymm2,%ymm2
	vpmuludq -128(%edx),%ymm6,%ymm5
	vpaddq %ymm5,%ymm3,%ymm3
	vpmuludq -96(%edx),%ymm6,%ymm7
	vpaddq %ymm7,%ymm4,%ymm4
	vpmuludq 64(%edx),%ymm6,%ymm5
	vpaddq %ymm5,%ymm0,%ymm0
	vmovdqa 128(%esp),%ymm5
	vpmuludq 96(%edx),%ymm6,%ymm7
	vpaddq %ymm7,%ymm1,%ymm1
	vpmuludq 128(%edx),%ymm6,%ymm6
	vpaddq %ymm6,%ymm2,%ymm2
	vpmuludq 128(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm3,%ymm3
	vpmuludq 32(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm0,%ymm0
	vpmuludq -128(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm4,%ymm4
	vmovdqa 64(%ebx),%ymm7
	vpmuludq 64(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm1,%ymm1
	vpmuludq 96(%edx),%ymm5,%ymm5
	vpaddq %ymm5,%ymm2,%ymm2
	vpsrlq $26,%ymm3,%ymm5
	vpand %ymm7,%ymm3,%ymm3
	vpsrlq $26,%ymm0,%ymm6
	vpand %ymm7,%ymm0,%ymm0
	vpaddq %ymm5,%ymm4,%ymm4
	vpaddq %ymm6,%ymm1,%ymm1
	vpsrlq $26,%ymm4,%ymm5
	vpand %ymm7,%ymm4,%ymm4
	vpsrlq $26,%ymm1,%ymm6
	vpand %ymm7,%ymm1,%ymm1
	vpaddq %ymm6,%ymm2,%ymm2
	vpaddq %ymm5,%ymm0,%ymm0
	vpsllq $2,%ymm5,%ymm5
	vpsrlq $26,%ymm2,%ymm6
	vpand %ymm7,%ymm2,%ymm2
	vpaddq %ymm5,%ymm0,%ymm0
	vpaddq %ymm6,%ymm3,%ymm3
	vpsrlq $26,%ymm3,%ymm6
	vpsrlq $26,%ymm0,%ymm5
	vpand %ymm7,%ymm0,%ymm0
	vpand %ymm7,%ymm3,%ymm3
	vpaddq %ymm5,%ymm1,%ymm1
	vpaddq %ymm6,%ymm4,%ymm4
	vmovdqu (%esi),%xmm5
	vmovdqu 16(%esi),%xmm6
	vinserti128 $1,32(%esi),%ymm5,%ymm5
	vinserti128 $1,48(%esi),%ymm6,%ymm6
	leal 64(%esi),%esi
	subl $64,%ecx
	jnz .L028loop
.L027tail:
	vmovdqa %ymm2,64(%esp)
	vpsrldq $6,%ymm5,%ymm2
	vmovdqa %ymm0,(%esp)
	vpsrldq $6,%ymm6,%ymm0
	vmovdqa %ymm1,32(%esp)
	vpunpckhqdq %ymm6,%ymm5,%ymm1
	vpunpcklqdq %ymm6,%ymm5,%ymm5
	vpunpcklqdq %ymm0,%ymm2,%ymm2
	vpsrlq $30,%ymm2,%ymm0
	vpsrlq $4,%ymm2,%ymm2
	vpsrlq $26,%ymm5,%ymm6
	vpsrlq $40,%ymm1,%ymm1
	vpand %ymm7,%ymm2,%ymm2
	vpand %ymm7,%ymm5,%ymm5
	vpand %ymm7,%ymm6,%ymm6
	vpand %ymm7,%ymm0,%ymm0
	vpor (%ebx),%ymm1,%ymm1
	andl $-64,%ebx
	vpaddq 64(%esp),%ymm2,%ymm2
	vpaddq (%esp),%ymm5,%ymm5
	vpaddq 32(%esp),%ymm6,%ymm6
	vpaddq %ymm3,%ymm0,%ymm0
	vpaddq %ymm4,%ymm1,%ymm1
	vpmuludq -92(%edx),%ymm2,%ymm3
	vmovdqa %ymm6,32(%esp)
	vpmuludq -60(%edx),%ymm2,%ymm4
	vmovdqa %ymm0,96(%esp)
	vpmuludq 100(%edx),%ymm2,%ymm0
	vmovdqa %ymm1,128(%esp)
	vpmuludq 132(%edx),%ymm2,%ymm1
	vpmuludq -124(%edx),%ymm2,%ymm2
	vpmuludq -28(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm3,%ymm3
	vpmuludq 4(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm4,%ymm4
	vpmuludq -124(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm0,%ymm0
	vmovdqa 32(%esp),%ymm7
	vpmuludq -92(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm1,%ymm1
	vpmuludq -60(%edx),%ymm5,%ymm5
	vpaddq %ymm5,%ymm2,%ymm2
	vpmuludq -60(%edx),%ymm7,%ymm6
	vpaddq %ymm6,%ymm3,%ymm3
	vpmuludq -28(%edx),%ymm7,%ymm5
	vpaddq %ymm5,%ymm4,%ymm4
	vpmuludq 132(%edx),%ymm7,%ymm6
	vpaddq %ymm6,%ymm0,%ymm0
	vmovdqa 96(%esp),%ymm6
	vpmuludq -124(%edx),%ymm7,%ymm5
	vpaddq %ymm5,%ymm1,%ymm1
	vpmuludq -92(%edx),%ymm7,%ymm7
	vpaddq %ymm7,%ymm2,%ymm2
	vpmuludq -124(%edx),%ymm6,%ymm5
	vpaddq %ymm5,%ymm3,%ymm3
	vpmuludq -92(%edx),%ymm6,%ymm7
	vpaddq %ymm7,%ymm4,%ymm4
	vpmuludq 68(%edx),%ymm6,%ymm5
	vpaddq %ymm5,%ymm0,%ymm0
	vmovdqa 128(%esp),%ymm5
	vpmuludq 100(%edx),%ymm6,%ymm7
	vpaddq %ymm7,%ymm1,%ymm1
	vpmuludq 132(%edx),%ymm6,%ymm6
	vpaddq %ymm6,%ymm2,%ymm2
	vpmuludq 132(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm3,%ymm3
	vpmuludq 36(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm0,%ymm0
	vpmuludq -124(%edx),%ymm5,%ymm7
	vpaddq %ymm7,%ymm4,%ymm4
	vmovdqa 64(%ebx),%ymm7
	vpmuludq 68(%edx),%ymm5,%ymm6
	vpaddq %ymm6,%ymm1,%ymm1
	vpmuludq 100(%edx),%ymm5,%ymm5
	vpaddq %ymm5,%ymm2,%ymm2
	vpsrldq $8,%ymm4,%ymm5
	vpsrldq $8,%ymm3,%ymm6
	vpaddq %ymm5,%ymm4,%ymm4
	vpsrldq $8,%ymm0,%ymm5
	vpaddq %ymm6,%ymm3,%ymm3
	vpsrldq $8,%ymm1,%ymm6
	vpaddq %ymm5,%ymm0,%ymm0
	vpsrldq $8,%ymm2,%ymm5
	vpaddq %ymm6,%ymm1,%ymm1
	vpermq $2,%ymm4,%ymm6
	vpaddq %ymm5,%ymm2,%ymm2
	vpermq $2,%ymm3,%ymm5
	vpaddq %ymm6,%ymm4,%ymm4
	vpermq $2,%ymm0,%ymm6
	vpaddq %ymm5,%ymm3,%ymm3
	vpermq $2,%ymm1,%ymm5
	vpaddq %ymm6,%ymm0,%ymm0
	vpermq $2,%ymm2,%ymm6
	vpaddq %ymm5,%ymm1,%ymm1
	vpaddq %ymm6,%ymm2,%ymm2
	vpsrlq $26,%ymm3,%ymm5
	vpand %ymm7,%ymm3,%ymm3
	vpsrlq $26,%ymm0,%ymm6
	vpand %ymm7,%ymm0,%ymm0
	vpaddq %ymm5,%ymm4,%ymm4
	vpaddq %ymm6,%ymm1,%ymm1
	vpsrlq $26,%ymm4,%ymm5
	vpand %ymm7,%ymm4,%ymm4
	vpsrlq $26,%ymm1,%ymm6
	vpand %ymm7,%ymm1,%ymm1
	vpaddq %ymm6,%ymm2,%ymm2
	vpaddq %ymm5,%ymm0,%ymm0
	vpsllq $2,%ymm5,%ymm5
	vpsrlq $26,%ymm2,%ymm6
	vpand %ymm7,%ymm2,%ymm2
	vpaddq %ymm5,%ymm0,%ymm0
	vpaddq %ymm6,%ymm3,%ymm3
	vpsrlq $26,%ymm3,%ymm6
	vpsrlq $26,%ymm0,%ymm5
	vpand %ymm7,%ymm0,%ymm0
	vpand %ymm7,%ymm3,%ymm3
	vpaddq %ymm5,%ymm1,%ymm1
	vpaddq %ymm6,%ymm4,%ymm4
	cmpl $0,%ecx
	je .L029done
	vpshufd $252,%xmm0,%xmm0
	leal 288(%esp),%edx
	vpshufd $252,%xmm1,%xmm1
	vpshufd $252,%xmm2,%xmm2
	vpshufd $252,%xmm3,%xmm3
	vpshufd $252,%xmm4,%xmm4
	jmp .L024even
.align 16
.L029done:
	vmovd %xmm0,-48(%edi)
	vmovd %xmm1,-44(%edi)
	vmovd %xmm2,-40(%edi)
	vmovd %xmm3,-36(%edi)
	vmovd %xmm4,-32(%edi)
	vzeroupper
	movl %ebp,%esp
.L020nodata:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
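/*
 * Constant pool shared by the vector paths: 16777216 = 1<<24 (the
 * 2^128 pad bit expressed in the top 26-bit limb), 67108863 = 2^26-1
 * (limb mask), 268435455/268435452 = the r clamp; the .byte data
 * spells "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>".
 */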
.align 64
.Lconst_sse2:
.long 16777216,0,16777216,0,16777216,0,16777216,0
.long 0,0,0,0,0,0,0,0
.long 67108863,0,67108863,0,67108863,0,67108863,0
.long 268435455,268435452,268435452,268435452
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.align 4
.comm OPENSSL_ia32cap_P,16,4
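/*
 * Non-PIC build of the same code: apart from OPENSSL_ia32cap_P being
 * referenced by absolute address in poly1305_init, this half of the
 * #ifdef mirrors the PIC half above.
 */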
#else
.text
.align 64
.globl poly1305_init
.type poly1305_init,@function
.align 16
poly1305_init:
.L_poly1305_init_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ebp
	xorl %eax,%eax
	movl %eax,(%edi)
	movl %eax,4(%edi)
	movl %eax,8(%edi)
	movl %eax,12(%edi)
	movl %eax,16(%edi)
	movl %eax,20(%edi)
	cmpl $0,%esi
	je .L000nokey
	call .L001pic_point
.L001pic_point:
	popl %ebx
	leal poly1305_blocks-.L001pic_point(%ebx),%eax
	leal poly1305_emit-.L001pic_point(%ebx),%edx
	leal OPENSSL_ia32cap_P,%edi
	movl (%edi),%ecx
	andl $83886080,%ecx
	cmpl $83886080,%ecx
	jne .L002no_sse2
	leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	movl 8(%edi),%ecx
	testl $32,%ecx
	jz .L002no_sse2
	leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl 20(%esp),%edi
	movl %eax,(%ebp)
	movl %edx,4(%ebp)
	movl (%esi),%eax
	movl 4(%esi),%ebx
	movl 8(%esi),%ecx
	movl 12(%esi),%edx
	andl $268435455,%eax
	andl $268435452,%ebx
	andl $268435452,%ecx
	andl $268435452,%edx
	movl %eax,24(%edi)
	movl %ebx,28(%edi)
	movl %ecx,32(%edi)
	movl %edx,36(%edi)
	movl $1,%eax
.L000nokey:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size poly1305_init,.-.L_poly1305_init_begin
.globl poly1305_blocks
.type poly1305_blocks,@function
.align 16
poly1305_blocks:
.L_poly1305_blocks_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%edi
	movl 24(%esp),%esi
	movl 28(%esp),%ecx
.Lenter_blocks:
	andl $-15,%ecx
	jz .L003nodata
	subl $64,%esp
	movl 24(%edi),%eax
	movl 28(%edi),%ebx
	leal (%esi,%ecx,1),%ebp
	movl 32(%edi),%ecx
	movl 36(%edi),%edx
	movl %ebp,92(%esp)
	movl %esi,%ebp
	movl %eax,36(%esp)
	movl %ebx,%eax
	shrl $2,%eax
	movl %ebx,40(%esp)
	addl %ebx,%eax
	movl %ecx,%ebx
	shrl $2,%ebx
	movl %ecx,44(%esp)
	addl %ecx,%ebx
	movl %edx,%ecx
	shrl $2,%ecx
	movl %edx,48(%esp)
	addl %edx,%ecx
	movl %eax,52(%esp)
	movl %ebx,56(%esp)
	movl %ecx,60(%esp)
	movl (%edi),%eax
	movl 4(%edi),%ebx
	movl 8(%edi),%ecx
	movl 12(%edi),%esi
	movl 16(%edi),%edi
	jmp .L004loop
.align 32
.L004loop:
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	adcl 8(%ebp),%ecx
	adcl 12(%ebp),%esi
	leal 16(%ebp),%ebp
	adcl 96(%esp),%edi
	movl %eax,(%esp)
	movl %esi,12(%esp)
	mull 36(%esp)
	movl %edi,16(%esp)
	movl %eax,%edi
	movl %ebx,%eax
	movl %edx,%esi
	mull 60(%esp)
	addl %eax,%edi
	movl %ecx,%eax
	adcl %edx,%esi
	mull 56(%esp)
	addl %eax,%edi
	movl 12(%esp),%eax
	adcl %edx,%esi
	mull 52(%esp)
	addl %eax,%edi
	movl (%esp),%eax
	adcl %edx,%esi
	mull 40(%esp)
	movl %edi,20(%esp)
	xorl %edi,%edi
	addl %eax,%esi
	movl %ebx,%eax
	adcl %edx,%edi
	mull 36(%esp)
	addl %eax,%esi
	movl %ecx,%eax
	adcl %edx,%edi
	mull 60(%esp)
	addl %eax,%esi
	movl 12(%esp),%eax
	adcl %edx,%edi
	mull 56(%esp)
	addl %eax,%esi
	movl 16(%esp),%eax
	adcl %edx,%edi
	imull 52(%esp),%eax
	addl %eax,%esi
	movl (%esp),%eax
	adcl $0,%edi
	mull 44(%esp)
	movl %esi,24(%esp)
	xorl %esi,%esi
	addl %eax,%edi
	movl %ebx,%eax
	adcl %edx,%esi
	mull 40(%esp)
	addl %eax,%edi
	movl %ecx,%eax
	adcl %edx,%esi
	mull 36(%esp)
	addl %eax,%edi
	movl 12(%esp),%eax
	adcl %edx,%esi
	mull 60(%esp)
	addl %eax,%edi
	movl 16(%esp),%eax
	adcl %edx,%esi
	imull 56(%esp),%eax
	addl %eax,%edi
	movl (%esp),%eax
	adcl $0,%esi
	mull 48(%esp)
	movl %edi,28(%esp)
	xorl %edi,%edi
	addl %eax,%esi
	movl %ebx,%eax
	adcl %edx,%edi
	mull 44(%esp)
	addl %eax,%esi
	movl %ecx,%eax
	adcl %edx,%edi
	mull 40(%esp)
	addl %eax,%esi
	movl 12(%esp),%eax
	adcl %edx,%edi
	mull 36(%esp)
	addl %eax,%esi
	movl 16(%esp),%ecx
	adcl %edx,%edi
	movl %ecx,%edx
	imull 60(%esp),%ecx
	addl %ecx,%esi
	movl 20(%esp),%eax
	adcl $0,%edi
	imull 36(%esp),%edx
	addl %edi,%edx
	movl 24(%esp),%ebx
	movl 28(%esp),%ecx
	movl %edx,%edi
	shrl $2,%edx
	andl $3,%edi
	leal (%edx,%edx,4),%edx
	addl %edx,%eax
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%esi
	adcl $0,%edi
	cmpl 92(%esp),%ebp
	jne .L004loop
	movl 84(%esp),%edx
	addl $64,%esp
	movl %eax,(%edx)
	movl %ebx,4(%edx)
	movl %ecx,8(%edx)
	movl %esi,12(%edx)
	movl %edi,16(%edx)
.L003nodata:
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size poly1305_blocks,.-.L_poly1305_blocks_begin
.globl poly1305_emit
.type poly1305_emit,@function
.align 16
poly1305_emit:
.L_poly1305_emit_begin:
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	movl 20(%esp),%ebp
.Lenter_emit:
	movl 24(%esp),%edi
	movl (%ebp),%eax
	movl 4(%ebp),%ebx
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 16(%ebp),%esi
	addl $5,%eax
	adcl $0,%ebx
	adcl $0,%ecx
	adcl $0,%edx
	adcl $0,%esi
	shrl $2,%esi
	negl %esi
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	movl %eax,(%edi)
	movl %ebx,4(%edi)
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	notl %esi
	movl (%ebp),%eax
	movl 4(%ebp),%ebx
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 28(%esp),%ebp
	andl %esi,%eax
	andl %esi,%ebx
	andl %esi,%ecx
	andl %esi,%edx
	orl (%edi),%eax
	orl 4(%edi),%ebx
	orl 8(%edi),%ecx
	orl 12(%edi),%edx
	addl (%ebp),%eax
	adcl 4(%ebp),%ebx
	adcl 8(%ebp),%ecx
	adcl 12(%ebp),%edx
	movl %eax,(%edi)
	movl %ebx,4(%edi)
	movl %ecx,8(%edi)
	movl %edx,12(%edi)
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
.size poly1305_emit,.-.L_poly1305_emit_begin
.align 32
.type _poly1305_init_sse2,@function
.align 16
_poly1305_init_sse2:
	movdqu 24(%edi),%xmm4
	leal 48(%edi),%edi
	movl %esp,%ebp
	subl $224,%esp
	andl $-16,%esp
	movq 64(%ebx),%xmm7
	movdqa %xmm4,%xmm0
	movdqa %xmm4,%xmm1
	movdqa %xmm4,%xmm2
	pand %xmm7,%xmm0
	psrlq $26,%xmm1
	psrldq $6,%xmm2
	pand %xmm7,%xmm1
	movdqa %xmm2,%xmm3
	psrlq $4,%xmm2
	psrlq $30,%xmm3
	pand %xmm7,%xmm2
	pand %xmm7,%xmm3
	psrldq $13,%xmm4
	leal 144(%esp),%edx
	movl $2,%ecx
.L005square:
	movdqa %xmm0,(%esp)
	movdqa %xmm1,16(%esp)
	movdqa %xmm2,32(%esp)
	movdqa %xmm3,48(%esp)
	movdqa %xmm4,64(%esp)
	movdqa %xmm1,%xmm6
	movdqa %xmm2,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm1,%xmm6
	paddd %xmm2,%xmm5
	movdqa %xmm6,80(%esp)
	movdqa %xmm5,96(%esp)
	movdqa %xmm3,%xmm6
	movdqa %xmm4,%xmm5
	pslld $2,%xmm6
	pslld $2,%xmm5
	paddd %xmm3,%xmm6
	paddd %xmm4,%xmm5
	movdqa %xmm6,112(%esp)
	movdqa %xmm5,128(%esp)
	pshufd $68,%xmm0,%xmm6
	movdqa %xmm1,%xmm5
	pshufd $68,%xmm1,%xmm1
	pshufd $68,%xmm2,%xmm2
	pshufd $68,%xmm3,%xmm3
	pshufd $68,%xmm4,%xmm4
	movdqa %xmm6,(%edx)
|
||
|
movdqa %xmm1,16(%edx)
|
||
|
movdqa %xmm2,32(%edx)
|
||
|
movdqa %xmm3,48(%edx)
|
||
|
movdqa %xmm4,64(%edx)
|
||
|
pmuludq %xmm0,%xmm4
|
||
|
pmuludq %xmm0,%xmm3
|
||
|
pmuludq %xmm0,%xmm2
|
||
|
pmuludq %xmm0,%xmm1
|
||
|
pmuludq %xmm6,%xmm0
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 48(%edx),%xmm5
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 32(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa 80(%esp),%xmm6
|
||
|
pmuludq (%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
pmuludq 64(%edx),%xmm6
|
||
|
movdqa 32(%esp),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 32(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm0
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 16(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm4
|
||
|
movdqa 96(%esp),%xmm7
|
||
|
pmuludq (%edx),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 64(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm2
|
||
|
pmuludq 48(%edx),%xmm5
|
||
|
movdqa 48(%esp),%xmm6
|
||
|
paddq %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 16(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
movdqa 112(%esp),%xmm5
|
||
|
pmuludq (%edx),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 64(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm3
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 48(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm2
|
||
|
pmuludq 32(%edx),%xmm7
|
||
|
movdqa 64(%esp),%xmm5
|
||
|
paddq %xmm6,%xmm1
|
||
|
movdqa 128(%esp),%xmm6
|
||
|
pmuludq (%edx),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 64(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 32(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
pmuludq 48(%edx),%xmm6
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
paddq %xmm6,%xmm2
|
||
|
movdqa %xmm3,%xmm5
|
||
|
pand %xmm7,%xmm3
|
||
|
psrlq $26,%xmm5
|
||
|
paddq %xmm4,%xmm5
|
||
|
movdqa %xmm0,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
psrlq $26,%xmm6
|
||
|
movdqa %xmm5,%xmm4
|
||
|
paddq %xmm1,%xmm6
|
||
|
psrlq $26,%xmm5
|
||
|
pand %xmm7,%xmm4
|
||
|
movdqa %xmm6,%xmm1
|
||
|
psrlq $26,%xmm6
|
||
|
paddd %xmm5,%xmm0
|
||
|
psllq $2,%xmm5
|
||
|
paddq %xmm2,%xmm6
|
||
|
paddq %xmm0,%xmm5
|
||
|
pand %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm2
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
paddd %xmm3,%xmm6
|
||
|
movdqa %xmm5,%xmm0
|
||
|
psrlq $26,%xmm5
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
paddd %xmm5,%xmm1
|
||
|
pand %xmm7,%xmm3
|
||
|
paddd %xmm6,%xmm4
|
||
|
decl %ecx
|
||
|
jz .L006square_break
|
||
|
punpcklqdq (%esp),%xmm0
|
||
|
punpcklqdq 16(%esp),%xmm1
|
||
|
punpcklqdq 32(%esp),%xmm2
|
||
|
punpcklqdq 48(%esp),%xmm3
|
||
|
punpcklqdq 64(%esp),%xmm4
|
||
|
jmp .L005square
|
||
|
.L006square_break:
|
||
|
psllq $32,%xmm0
|
||
|
psllq $32,%xmm1
|
||
|
psllq $32,%xmm2
|
||
|
psllq $32,%xmm3
|
||
|
psllq $32,%xmm4
|
||
|
por (%esp),%xmm0
|
||
|
por 16(%esp),%xmm1
|
||
|
por 32(%esp),%xmm2
|
||
|
por 48(%esp),%xmm3
|
||
|
por 64(%esp),%xmm4
|
||
|
pshufd $141,%xmm0,%xmm0
|
||
|
pshufd $141,%xmm1,%xmm1
|
||
|
pshufd $141,%xmm2,%xmm2
|
||
|
pshufd $141,%xmm3,%xmm3
|
||
|
pshufd $141,%xmm4,%xmm4
|
||
|
movdqu %xmm0,(%edi)
|
||
|
movdqu %xmm1,16(%edi)
|
||
|
movdqu %xmm2,32(%edi)
|
||
|
movdqu %xmm3,48(%edi)
|
||
|
movdqu %xmm4,64(%edi)
|
||
|
movdqa %xmm1,%xmm6
|
||
|
movdqa %xmm2,%xmm5
|
||
|
pslld $2,%xmm6
|
||
|
pslld $2,%xmm5
|
||
|
paddd %xmm1,%xmm6
|
||
|
paddd %xmm2,%xmm5
|
||
|
movdqu %xmm6,80(%edi)
|
||
|
movdqu %xmm5,96(%edi)
|
||
|
movdqa %xmm3,%xmm6
|
||
|
movdqa %xmm4,%xmm5
|
||
|
pslld $2,%xmm6
|
||
|
pslld $2,%xmm5
|
||
|
paddd %xmm3,%xmm6
|
||
|
paddd %xmm4,%xmm5
|
||
|
movdqu %xmm6,112(%edi)
|
||
|
movdqu %xmm5,128(%edi)
|
||
|
movl %ebp,%esp
|
||
|
leal -48(%edi),%edi
|
||
|
ret
|
||
|
.size _poly1305_init_sse2,.-_poly1305_init_sse2
|
||
|
.align 32
|
||
|
.type _poly1305_blocks_sse2,@function
|
||
|
.align 16
|
||
|
_poly1305_blocks_sse2:
|
||
|
pushl %ebp
|
||
|
pushl %ebx
|
||
|
pushl %esi
|
||
|
pushl %edi
|
||
|
movl 20(%esp),%edi
|
||
|
movl 24(%esp),%esi
|
||
|
movl 28(%esp),%ecx
|
||
|
movl 20(%edi),%eax
|
||
|
andl $-16,%ecx
|
||
|
jz .L007nodata
|
||
|
cmpl $64,%ecx
|
||
|
jae .L008enter_sse2
|
||
|
testl %eax,%eax
|
||
|
jz .Lenter_blocks
|
||
|
.align 16
|
||
|
.L008enter_sse2:
|
||
|
call .L009pic_point
|
||
|
.L009pic_point:
|
||
|
popl %ebx
|
||
|
leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
|
||
|
testl %eax,%eax
|
||
|
jnz .L010base2_26
|
||
|
call _poly1305_init_sse2
|
||
|
movl (%edi),%eax
|
||
|
movl 3(%edi),%ecx
|
||
|
movl 6(%edi),%edx
|
||
|
movl 9(%edi),%esi
|
||
|
movl 13(%edi),%ebp
|
||
|
movl $1,20(%edi)
|
||
|
shrl $2,%ecx
|
||
|
andl $67108863,%eax
|
||
|
shrl $4,%edx
|
||
|
andl $67108863,%ecx
|
||
|
shrl $6,%esi
|
||
|
andl $67108863,%edx
|
||
|
movd %eax,%xmm0
|
||
|
movd %ecx,%xmm1
|
||
|
movd %edx,%xmm2
|
||
|
movd %esi,%xmm3
|
||
|
movd %ebp,%xmm4
|
||
|
movl 24(%esp),%esi
|
||
|
movl 28(%esp),%ecx
|
||
|
jmp .L011base2_32
|
||
|
.align 16
|
||
|
.L010base2_26:
|
||
|
movd (%edi),%xmm0
|
||
|
movd 4(%edi),%xmm1
|
||
|
movd 8(%edi),%xmm2
|
||
|
movd 12(%edi),%xmm3
|
||
|
movd 16(%edi),%xmm4
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
.L011base2_32:
|
||
|
movl 32(%esp),%eax
|
||
|
movl %esp,%ebp
|
||
|
subl $528,%esp
|
||
|
andl $-16,%esp
|
||
|
leal 48(%edi),%edi
|
||
|
shll $24,%eax
|
||
|
testl $31,%ecx
|
||
|
jz .L012even
|
||
|
movdqu (%esi),%xmm6
|
||
|
leal 16(%esi),%esi
|
||
|
movdqa %xmm6,%xmm5
|
||
|
pand %xmm7,%xmm6
|
||
|
paddd %xmm6,%xmm0
|
||
|
movdqa %xmm5,%xmm6
|
||
|
psrlq $26,%xmm5
|
||
|
psrldq $6,%xmm6
|
||
|
pand %xmm7,%xmm5
|
||
|
paddd %xmm5,%xmm1
|
||
|
movdqa %xmm6,%xmm5
|
||
|
psrlq $4,%xmm6
|
||
|
pand %xmm7,%xmm6
|
||
|
paddd %xmm6,%xmm2
|
||
|
movdqa %xmm5,%xmm6
|
||
|
psrlq $30,%xmm5
|
||
|
pand %xmm7,%xmm5
|
||
|
psrldq $7,%xmm6
|
||
|
paddd %xmm5,%xmm3
|
||
|
movd %eax,%xmm5
|
||
|
paddd %xmm6,%xmm4
|
||
|
movd 12(%edi),%xmm6
|
||
|
paddd %xmm5,%xmm4
|
||
|
movdqa %xmm0,(%esp)
|
||
|
movdqa %xmm1,16(%esp)
|
||
|
movdqa %xmm2,32(%esp)
|
||
|
movdqa %xmm3,48(%esp)
|
||
|
movdqa %xmm4,64(%esp)
|
||
|
pmuludq %xmm6,%xmm0
|
||
|
pmuludq %xmm6,%xmm1
|
||
|
pmuludq %xmm6,%xmm2
|
||
|
movd 28(%edi),%xmm5
|
||
|
pmuludq %xmm6,%xmm3
|
||
|
pmuludq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 48(%esp),%xmm5
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 32(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movd 92(%edi),%xmm6
|
||
|
pmuludq (%esp),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
pmuludq 64(%esp),%xmm6
|
||
|
movd 44(%edi),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 32(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm0
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 16(%esp),%xmm5
|
||
|
paddq %xmm7,%xmm4
|
||
|
movd 108(%edi),%xmm7
|
||
|
pmuludq (%esp),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 64(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm2
|
||
|
pmuludq 48(%esp),%xmm5
|
||
|
movd 60(%edi),%xmm6
|
||
|
paddq %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 16(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
movd 124(%edi),%xmm5
|
||
|
pmuludq (%esp),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 64(%esp),%xmm5
|
||
|
paddq %xmm7,%xmm3
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 48(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm2
|
||
|
pmuludq 32(%esp),%xmm7
|
||
|
movd 76(%edi),%xmm5
|
||
|
paddq %xmm6,%xmm1
|
||
|
movd 140(%edi),%xmm6
|
||
|
pmuludq (%esp),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 64(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 32(%esp),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
pmuludq 48(%esp),%xmm6
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
paddq %xmm6,%xmm2
|
||
|
movdqa %xmm3,%xmm5
|
||
|
pand %xmm7,%xmm3
|
||
|
psrlq $26,%xmm5
|
||
|
paddq %xmm4,%xmm5
|
||
|
movdqa %xmm0,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
psrlq $26,%xmm6
|
||
|
movdqa %xmm5,%xmm4
|
||
|
paddq %xmm1,%xmm6
|
||
|
psrlq $26,%xmm5
|
||
|
pand %xmm7,%xmm4
|
||
|
movdqa %xmm6,%xmm1
|
||
|
psrlq $26,%xmm6
|
||
|
paddd %xmm5,%xmm0
|
||
|
psllq $2,%xmm5
|
||
|
paddq %xmm2,%xmm6
|
||
|
paddq %xmm0,%xmm5
|
||
|
pand %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm2
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
paddd %xmm3,%xmm6
|
||
|
movdqa %xmm5,%xmm0
|
||
|
psrlq $26,%xmm5
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
paddd %xmm5,%xmm1
|
||
|
pand %xmm7,%xmm3
|
||
|
paddd %xmm6,%xmm4
|
||
|
subl $16,%ecx
|
||
|
jz .L013done
|
||
|
.L012even:
|
||
|
leal 384(%esp),%edx
|
||
|
leal -32(%esi),%eax
|
||
|
subl $64,%ecx
|
||
|
movdqu (%edi),%xmm5
|
||
|
pshufd $68,%xmm5,%xmm6
|
||
|
cmovbl %eax,%esi
|
||
|
pshufd $238,%xmm5,%xmm5
|
||
|
movdqa %xmm6,(%edx)
|
||
|
leal 160(%esp),%eax
|
||
|
movdqu 16(%edi),%xmm6
|
||
|
movdqa %xmm5,-144(%edx)
|
||
|
pshufd $68,%xmm6,%xmm5
|
||
|
pshufd $238,%xmm6,%xmm6
|
||
|
movdqa %xmm5,16(%edx)
|
||
|
movdqu 32(%edi),%xmm5
|
||
|
movdqa %xmm6,-128(%edx)
|
||
|
pshufd $68,%xmm5,%xmm6
|
||
|
pshufd $238,%xmm5,%xmm5
|
||
|
movdqa %xmm6,32(%edx)
|
||
|
movdqu 48(%edi),%xmm6
|
||
|
movdqa %xmm5,-112(%edx)
|
||
|
pshufd $68,%xmm6,%xmm5
|
||
|
pshufd $238,%xmm6,%xmm6
|
||
|
movdqa %xmm5,48(%edx)
|
||
|
movdqu 64(%edi),%xmm5
|
||
|
movdqa %xmm6,-96(%edx)
|
||
|
pshufd $68,%xmm5,%xmm6
|
||
|
pshufd $238,%xmm5,%xmm5
|
||
|
movdqa %xmm6,64(%edx)
|
||
|
movdqu 80(%edi),%xmm6
|
||
|
movdqa %xmm5,-80(%edx)
|
||
|
pshufd $68,%xmm6,%xmm5
|
||
|
pshufd $238,%xmm6,%xmm6
|
||
|
movdqa %xmm5,80(%edx)
|
||
|
movdqu 96(%edi),%xmm5
|
||
|
movdqa %xmm6,-64(%edx)
|
||
|
pshufd $68,%xmm5,%xmm6
|
||
|
pshufd $238,%xmm5,%xmm5
|
||
|
movdqa %xmm6,96(%edx)
|
||
|
movdqu 112(%edi),%xmm6
|
||
|
movdqa %xmm5,-48(%edx)
|
||
|
pshufd $68,%xmm6,%xmm5
|
||
|
pshufd $238,%xmm6,%xmm6
|
||
|
movdqa %xmm5,112(%edx)
|
||
|
movdqu 128(%edi),%xmm5
|
||
|
movdqa %xmm6,-32(%edx)
|
||
|
pshufd $68,%xmm5,%xmm6
|
||
|
pshufd $238,%xmm5,%xmm5
|
||
|
movdqa %xmm6,128(%edx)
|
||
|
movdqa %xmm5,-16(%edx)
|
||
|
movdqu 32(%esi),%xmm5
|
||
|
movdqu 48(%esi),%xmm6
|
||
|
leal 32(%esi),%esi
|
||
|
movdqa %xmm2,112(%esp)
|
||
|
movdqa %xmm3,128(%esp)
|
||
|
movdqa %xmm4,144(%esp)
|
||
|
movdqa %xmm5,%xmm2
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrldq $6,%xmm2
|
||
|
psrldq $6,%xmm3
|
||
|
movdqa %xmm5,%xmm4
|
||
|
punpcklqdq %xmm3,%xmm2
|
||
|
punpckhqdq %xmm6,%xmm4
|
||
|
punpcklqdq %xmm6,%xmm5
|
||
|
movdqa %xmm2,%xmm3
|
||
|
psrlq $4,%xmm2
|
||
|
psrlq $30,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
psrlq $40,%xmm4
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm5
|
||
|
pand %xmm7,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
pand %xmm7,%xmm3
|
||
|
por (%ebx),%xmm4
|
||
|
movdqa %xmm0,80(%esp)
|
||
|
movdqa %xmm1,96(%esp)
|
||
|
jbe .L014skip_loop
|
||
|
jmp .L015loop
|
||
|
.align 32
|
||
|
.L015loop:
|
||
|
movdqa -144(%edx),%xmm7
|
||
|
movdqa %xmm6,16(%eax)
|
||
|
movdqa %xmm2,32(%eax)
|
||
|
movdqa %xmm3,48(%eax)
|
||
|
movdqa %xmm4,64(%eax)
|
||
|
movdqa %xmm5,%xmm1
|
||
|
pmuludq %xmm7,%xmm5
|
||
|
movdqa %xmm6,%xmm0
|
||
|
pmuludq %xmm7,%xmm6
|
||
|
pmuludq %xmm7,%xmm2
|
||
|
pmuludq %xmm7,%xmm3
|
||
|
pmuludq %xmm7,%xmm4
|
||
|
pmuludq -16(%edx),%xmm0
|
||
|
movdqa %xmm1,%xmm7
|
||
|
pmuludq -128(%edx),%xmm1
|
||
|
paddq %xmm5,%xmm0
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq -112(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm1
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq -96(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
movdqa 16(%eax),%xmm7
|
||
|
pmuludq -80(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq -128(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq -112(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
movdqa 32(%eax),%xmm7
|
||
|
pmuludq -96(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq -32(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq -16(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq -128(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm1
|
||
|
movdqa 48(%eax),%xmm5
|
||
|
pmuludq -112(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq -48(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm4
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq -32(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq -16(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm1
|
||
|
movdqa 64(%eax),%xmm6
|
||
|
pmuludq -128(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq -16(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq -64(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq -48(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
pmuludq -32(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm1
|
||
|
paddq %xmm6,%xmm2
|
||
|
movdqu -32(%esi),%xmm5
|
||
|
movdqu -16(%esi),%xmm6
|
||
|
leal 32(%esi),%esi
|
||
|
movdqa %xmm2,32(%esp)
|
||
|
movdqa %xmm3,48(%esp)
|
||
|
movdqa %xmm4,64(%esp)
|
||
|
movdqa %xmm5,%xmm2
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrldq $6,%xmm2
|
||
|
psrldq $6,%xmm3
|
||
|
movdqa %xmm5,%xmm4
|
||
|
punpcklqdq %xmm3,%xmm2
|
||
|
punpckhqdq %xmm6,%xmm4
|
||
|
punpcklqdq %xmm6,%xmm5
|
||
|
movdqa %xmm2,%xmm3
|
||
|
psrlq $4,%xmm2
|
||
|
psrlq $30,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
psrlq $40,%xmm4
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm5
|
||
|
pand %xmm7,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
pand %xmm7,%xmm3
|
||
|
por (%ebx),%xmm4
|
||
|
leal -32(%esi),%eax
|
||
|
subl $64,%ecx
|
||
|
paddd 80(%esp),%xmm5
|
||
|
paddd 96(%esp),%xmm6
|
||
|
paddd 112(%esp),%xmm2
|
||
|
paddd 128(%esp),%xmm3
|
||
|
paddd 144(%esp),%xmm4
|
||
|
cmovbl %eax,%esi
|
||
|
leal 160(%esp),%eax
|
||
|
movdqa (%edx),%xmm7
|
||
|
movdqa %xmm1,16(%esp)
|
||
|
movdqa %xmm6,16(%eax)
|
||
|
movdqa %xmm2,32(%eax)
|
||
|
movdqa %xmm3,48(%eax)
|
||
|
movdqa %xmm4,64(%eax)
|
||
|
movdqa %xmm5,%xmm1
|
||
|
pmuludq %xmm7,%xmm5
|
||
|
paddq %xmm0,%xmm5
|
||
|
movdqa %xmm6,%xmm0
|
||
|
pmuludq %xmm7,%xmm6
|
||
|
pmuludq %xmm7,%xmm2
|
||
|
pmuludq %xmm7,%xmm3
|
||
|
pmuludq %xmm7,%xmm4
|
||
|
paddq 16(%esp),%xmm6
|
||
|
paddq 32(%esp),%xmm2
|
||
|
paddq 48(%esp),%xmm3
|
||
|
paddq 64(%esp),%xmm4
|
||
|
pmuludq 128(%edx),%xmm0
|
||
|
movdqa %xmm1,%xmm7
|
||
|
pmuludq 16(%edx),%xmm1
|
||
|
paddq %xmm5,%xmm0
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 32(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm1
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 48(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
movdqa 16(%eax),%xmm7
|
||
|
pmuludq 64(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 32(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
movdqa 32(%eax),%xmm7
|
||
|
pmuludq 48(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 112(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 128(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 16(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm1
|
||
|
movdqa 48(%eax),%xmm5
|
||
|
pmuludq 32(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 96(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm4
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 112(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 128(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm1
|
||
|
movdqa 64(%eax),%xmm6
|
||
|
pmuludq 16(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 128(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 80(%edx),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 96(%edx),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
pmuludq 112(%edx),%xmm6
|
||
|
paddq %xmm5,%xmm1
|
||
|
paddq %xmm6,%xmm2
|
||
|
movdqa %xmm3,%xmm5
|
||
|
pand %xmm7,%xmm3
|
||
|
psrlq $26,%xmm5
|
||
|
paddq %xmm4,%xmm5
|
||
|
movdqa %xmm0,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
psrlq $26,%xmm6
|
||
|
movdqa %xmm5,%xmm4
|
||
|
paddq %xmm1,%xmm6
|
||
|
psrlq $26,%xmm5
|
||
|
pand %xmm7,%xmm4
|
||
|
movdqa %xmm6,%xmm1
|
||
|
psrlq $26,%xmm6
|
||
|
paddd %xmm5,%xmm0
|
||
|
psllq $2,%xmm5
|
||
|
paddq %xmm2,%xmm6
|
||
|
paddq %xmm0,%xmm5
|
||
|
pand %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm2
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
paddd %xmm3,%xmm6
|
||
|
movdqa %xmm5,%xmm0
|
||
|
psrlq $26,%xmm5
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
paddd %xmm5,%xmm1
|
||
|
pand %xmm7,%xmm3
|
||
|
paddd %xmm6,%xmm4
|
||
|
movdqu 32(%esi),%xmm5
|
||
|
movdqu 48(%esi),%xmm6
|
||
|
leal 32(%esi),%esi
|
||
|
movdqa %xmm2,112(%esp)
|
||
|
movdqa %xmm3,128(%esp)
|
||
|
movdqa %xmm4,144(%esp)
|
||
|
movdqa %xmm5,%xmm2
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrldq $6,%xmm2
|
||
|
psrldq $6,%xmm3
|
||
|
movdqa %xmm5,%xmm4
|
||
|
punpcklqdq %xmm3,%xmm2
|
||
|
punpckhqdq %xmm6,%xmm4
|
||
|
punpcklqdq %xmm6,%xmm5
|
||
|
movdqa %xmm2,%xmm3
|
||
|
psrlq $4,%xmm2
|
||
|
psrlq $30,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
psrlq $40,%xmm4
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm5
|
||
|
pand %xmm7,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
pand %xmm7,%xmm3
|
||
|
por (%ebx),%xmm4
|
||
|
movdqa %xmm0,80(%esp)
|
||
|
movdqa %xmm1,96(%esp)
|
||
|
ja .L015loop
|
||
|
.L014skip_loop:
|
||
|
pshufd $16,-144(%edx),%xmm7
|
||
|
addl $32,%ecx
|
||
|
jnz .L016long_tail
|
||
|
paddd %xmm0,%xmm5
|
||
|
paddd %xmm1,%xmm6
|
||
|
paddd 112(%esp),%xmm2
|
||
|
paddd 128(%esp),%xmm3
|
||
|
paddd 144(%esp),%xmm4
|
||
|
.L016long_tail:
|
||
|
movdqa %xmm5,(%eax)
|
||
|
movdqa %xmm6,16(%eax)
|
||
|
movdqa %xmm2,32(%eax)
|
||
|
movdqa %xmm3,48(%eax)
|
||
|
movdqa %xmm4,64(%eax)
|
||
|
pmuludq %xmm7,%xmm5
|
||
|
pmuludq %xmm7,%xmm6
|
||
|
pmuludq %xmm7,%xmm2
|
||
|
movdqa %xmm5,%xmm0
|
||
|
pshufd $16,-128(%edx),%xmm5
|
||
|
pmuludq %xmm7,%xmm3
|
||
|
movdqa %xmm6,%xmm1
|
||
|
pmuludq %xmm7,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 48(%eax),%xmm5
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 32(%eax),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%eax),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
pshufd $16,-64(%edx),%xmm6
|
||
|
pmuludq (%eax),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
pmuludq 64(%eax),%xmm6
|
||
|
pshufd $16,-112(%edx),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 32(%eax),%xmm7
|
||
|
paddq %xmm6,%xmm0
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 16(%eax),%xmm5
|
||
|
paddq %xmm7,%xmm4
|
||
|
pshufd $16,-48(%edx),%xmm7
|
||
|
pmuludq (%eax),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 64(%eax),%xmm7
|
||
|
paddq %xmm6,%xmm2
|
||
|
pmuludq 48(%eax),%xmm5
|
||
|
pshufd $16,-96(%edx),%xmm6
|
||
|
paddq %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 16(%eax),%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
pshufd $16,-32(%edx),%xmm5
|
||
|
pmuludq (%eax),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 64(%eax),%xmm5
|
||
|
paddq %xmm7,%xmm3
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 48(%eax),%xmm6
|
||
|
paddq %xmm5,%xmm2
|
||
|
pmuludq 32(%eax),%xmm7
|
||
|
pshufd $16,-80(%edx),%xmm5
|
||
|
paddq %xmm6,%xmm1
|
||
|
pshufd $16,-16(%edx),%xmm6
|
||
|
pmuludq (%eax),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 64(%eax),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%eax),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 32(%eax),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
pmuludq 48(%eax),%xmm6
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
paddq %xmm6,%xmm2
|
||
|
jz .L017short_tail
|
||
|
movdqu -32(%esi),%xmm5
|
||
|
movdqu -16(%esi),%xmm6
|
||
|
leal 32(%esi),%esi
|
||
|
movdqa %xmm2,32(%esp)
|
||
|
movdqa %xmm3,48(%esp)
|
||
|
movdqa %xmm4,64(%esp)
|
||
|
movdqa %xmm5,%xmm2
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrldq $6,%xmm2
|
||
|
psrldq $6,%xmm3
|
||
|
movdqa %xmm5,%xmm4
|
||
|
punpcklqdq %xmm3,%xmm2
|
||
|
punpckhqdq %xmm6,%xmm4
|
||
|
punpcklqdq %xmm6,%xmm5
|
||
|
movdqa %xmm2,%xmm3
|
||
|
psrlq $4,%xmm2
|
||
|
psrlq $30,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
psrlq $40,%xmm4
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm5
|
||
|
pand %xmm7,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
pand %xmm7,%xmm3
|
||
|
por (%ebx),%xmm4
|
||
|
pshufd $16,(%edx),%xmm7
|
||
|
paddd 80(%esp),%xmm5
|
||
|
paddd 96(%esp),%xmm6
|
||
|
paddd 112(%esp),%xmm2
|
||
|
paddd 128(%esp),%xmm3
|
||
|
paddd 144(%esp),%xmm4
|
||
|
movdqa %xmm5,(%esp)
|
||
|
pmuludq %xmm7,%xmm5
|
||
|
movdqa %xmm6,16(%esp)
|
||
|
pmuludq %xmm7,%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
movdqa %xmm2,%xmm5
|
||
|
pmuludq %xmm7,%xmm2
|
||
|
paddq %xmm6,%xmm1
|
||
|
movdqa %xmm3,%xmm6
|
||
|
pmuludq %xmm7,%xmm3
|
||
|
paddq 32(%esp),%xmm2
|
||
|
movdqa %xmm5,32(%esp)
|
||
|
pshufd $16,16(%edx),%xmm5
|
||
|
paddq 48(%esp),%xmm3
|
||
|
movdqa %xmm6,48(%esp)
|
||
|
movdqa %xmm4,%xmm6
|
||
|
pmuludq %xmm7,%xmm4
|
||
|
paddq 64(%esp),%xmm4
|
||
|
movdqa %xmm6,64(%esp)
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 48(%esp),%xmm5
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 32(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
pshufd $16,80(%edx),%xmm6
|
||
|
pmuludq (%esp),%xmm5
|
||
|
paddq %xmm7,%xmm2
|
||
|
pmuludq 64(%esp),%xmm6
|
||
|
pshufd $16,32(%edx),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 32(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm0
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 16(%esp),%xmm5
|
||
|
paddq %xmm7,%xmm4
|
||
|
pshufd $16,96(%edx),%xmm7
|
||
|
pmuludq (%esp),%xmm6
|
||
|
paddq %xmm5,%xmm3
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 64(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm2
|
||
|
pmuludq 48(%esp),%xmm5
|
||
|
pshufd $16,48(%edx),%xmm6
|
||
|
paddq %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 16(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm0
|
||
|
pshufd $16,112(%edx),%xmm5
|
||
|
pmuludq (%esp),%xmm7
|
||
|
paddq %xmm6,%xmm4
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 64(%esp),%xmm5
|
||
|
paddq %xmm7,%xmm3
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 48(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm2
|
||
|
pmuludq 32(%esp),%xmm7
|
||
|
pshufd $16,64(%edx),%xmm5
|
||
|
paddq %xmm6,%xmm1
|
||
|
pshufd $16,128(%edx),%xmm6
|
||
|
pmuludq (%esp),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
movdqa %xmm6,%xmm7
|
||
|
pmuludq 64(%esp),%xmm6
|
||
|
paddq %xmm5,%xmm4
|
||
|
movdqa %xmm7,%xmm5
|
||
|
pmuludq 16(%esp),%xmm7
|
||
|
paddq %xmm6,%xmm3
|
||
|
movdqa %xmm5,%xmm6
|
||
|
pmuludq 32(%esp),%xmm5
|
||
|
paddq %xmm7,%xmm0
|
||
|
pmuludq 48(%esp),%xmm6
|
||
|
movdqa 64(%ebx),%xmm7
|
||
|
paddq %xmm5,%xmm1
|
||
|
paddq %xmm6,%xmm2
|
||
|
.L017short_tail:
|
||
|
pshufd $78,%xmm4,%xmm6
|
||
|
pshufd $78,%xmm3,%xmm5
|
||
|
paddq %xmm6,%xmm4
|
||
|
paddq %xmm5,%xmm3
|
||
|
pshufd $78,%xmm0,%xmm6
|
||
|
pshufd $78,%xmm1,%xmm5
|
||
|
paddq %xmm6,%xmm0
|
||
|
paddq %xmm5,%xmm1
|
||
|
pshufd $78,%xmm2,%xmm6
|
||
|
movdqa %xmm3,%xmm5
|
||
|
pand %xmm7,%xmm3
|
||
|
psrlq $26,%xmm5
|
||
|
paddq %xmm6,%xmm2
|
||
|
paddq %xmm4,%xmm5
|
||
|
movdqa %xmm0,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
psrlq $26,%xmm6
|
||
|
movdqa %xmm5,%xmm4
|
||
|
paddq %xmm1,%xmm6
|
||
|
psrlq $26,%xmm5
|
||
|
pand %xmm7,%xmm4
|
||
|
movdqa %xmm6,%xmm1
|
||
|
psrlq $26,%xmm6
|
||
|
paddd %xmm5,%xmm0
|
||
|
psllq $2,%xmm5
|
||
|
paddq %xmm2,%xmm6
|
||
|
paddq %xmm0,%xmm5
|
||
|
pand %xmm7,%xmm1
|
||
|
movdqa %xmm6,%xmm2
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm2
|
||
|
paddd %xmm3,%xmm6
|
||
|
movdqa %xmm5,%xmm0
|
||
|
psrlq $26,%xmm5
|
||
|
movdqa %xmm6,%xmm3
|
||
|
psrlq $26,%xmm6
|
||
|
pand %xmm7,%xmm0
|
||
|
paddd %xmm5,%xmm1
|
||
|
pand %xmm7,%xmm3
|
||
|
paddd %xmm6,%xmm4
|
||
|
.L013done:
|
||
|
movd %xmm0,-48(%edi)
|
||
|
movd %xmm1,-44(%edi)
|
||
|
movd %xmm2,-40(%edi)
|
||
|
movd %xmm3,-36(%edi)
|
||
|
movd %xmm4,-32(%edi)
|
||
|
movl %ebp,%esp
|
||
|
.L007nodata:
|
||
|
popl %edi
|
||
|
popl %esi
|
||
|
popl %ebx
|
||
|
popl %ebp
|
||
|
ret
|
||
|
.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
|
||
|
.align 32
|
||
|
.type _poly1305_emit_sse2,@function
|
||
|
.align 16
|
||
|
_poly1305_emit_sse2:
|
||
|
pushl %ebp
|
||
|
pushl %ebx
|
||
|
pushl %esi
|
||
|
pushl %edi
|
||
|
movl 20(%esp),%ebp
|
||
|
cmpl $0,20(%ebp)
|
||
|
je .Lenter_emit
|
||
|
movl (%ebp),%eax
|
||
|
movl 4(%ebp),%edi
|
||
|
movl 8(%ebp),%ecx
|
||
|
movl 12(%ebp),%edx
|
||
|
movl 16(%ebp),%esi
|
||
|
movl %edi,%ebx
|
||
|
shll $26,%edi
|
||
|
shrl $6,%ebx
|
||
|
addl %edi,%eax
|
||
|
movl %ecx,%edi
|
||
|
adcl $0,%ebx
|
||
|
shll $20,%edi
|
||
|
shrl $12,%ecx
|
||
|
addl %edi,%ebx
|
||
|
movl %edx,%edi
|
||
|
adcl $0,%ecx
|
||
|
shll $14,%edi
|
||
|
shrl $18,%edx
|
||
|
addl %edi,%ecx
|
||
|
movl %esi,%edi
|
||
|
adcl $0,%edx
|
||
|
shll $8,%edi
|
||
|
shrl $24,%esi
|
||
|
addl %edi,%edx
|
||
|
adcl $0,%esi
|
||
|
movl %esi,%edi
|
||
|
andl $3,%esi
|
||
|
shrl $2,%edi
|
||
|
leal (%edi,%edi,4),%ebp
|
||
|
movl 24(%esp),%edi
|
||
|
addl %ebp,%eax
|
||
|
movl 28(%esp),%ebp
|
||
|
adcl $0,%ebx
|
||
|
adcl $0,%ecx
|
||
|
adcl $0,%edx
|
||
|
adcl $0,%esi
|
||
|
movd %eax,%xmm0
|
||
|
addl $5,%eax
|
||
|
movd %ebx,%xmm1
|
||
|
adcl $0,%ebx
|
||
|
movd %ecx,%xmm2
|
||
|
adcl $0,%ecx
|
||
|
movd %edx,%xmm3
|
||
|
adcl $0,%edx
|
||
|
adcl $0,%esi
|
||
|
shrl $2,%esi
|
||
|
negl %esi
|
||
|
andl %esi,%eax
|
||
|
andl %esi,%ebx
|
||
|
andl %esi,%ecx
|
||
|
andl %esi,%edx
|
||
|
movl %eax,(%edi)
|
||
|
movd %xmm0,%eax
|
||
|
movl %ebx,4(%edi)
|
||
|
movd %xmm1,%ebx
|
||
|
movl %ecx,8(%edi)
|
||
|
movd %xmm2,%ecx
|
||
|
movl %edx,12(%edi)
|
||
|
movd %xmm3,%edx
|
||
|
notl %esi
|
||
|
andl %esi,%eax
|
||
|
andl %esi,%ebx
|
||
|
orl (%edi),%eax
|
||
|
andl %esi,%ecx
|
||
|
orl 4(%edi),%ebx
|
||
|
andl %esi,%edx
|
||
|
orl 8(%edi),%ecx
|
||
|
orl 12(%edi),%edx
|
||
|
addl (%ebp),%eax
|
||
|
adcl 4(%ebp),%ebx
|
||
|
movl %eax,(%edi)
|
||
|
adcl 8(%ebp),%ecx
|
||
|
movl %ebx,4(%edi)
|
||
|
adcl 12(%ebp),%edx
|
||
|
movl %ecx,8(%edi)
|
||
|
movl %edx,12(%edi)
|
||
|
popl %edi
|
||
|
popl %esi
|
||
|
popl %ebx
|
||
|
popl %ebp
|
||
|
ret
|
||
|
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
|
||
|
.align 32
|
||
|
.type _poly1305_init_avx2,@function
|
||
|
.align 16
|
||
|
_poly1305_init_avx2:
|
||
|
vmovdqu 24(%edi),%xmm4
|
||
|
leal 48(%edi),%edi
|
||
|
movl %esp,%ebp
|
||
|
subl $224,%esp
|
||
|
andl $-16,%esp
|
||
|
vmovdqa 64(%ebx),%xmm7
|
||
|
vpand %xmm7,%xmm4,%xmm0
|
||
|
vpsrlq $26,%xmm4,%xmm1
|
||
|
vpsrldq $6,%xmm4,%xmm3
|
||
|
vpand %xmm7,%xmm1,%xmm1
|
||
|
vpsrlq $4,%xmm3,%xmm2
|
||
|
vpsrlq $30,%xmm3,%xmm3
|
||
|
vpand %xmm7,%xmm2,%xmm2
|
||
|
vpand %xmm7,%xmm3,%xmm3
|
||
|
vpsrldq $13,%xmm4,%xmm4
|
||
|
leal 144(%esp),%edx
|
||
|
movl $2,%ecx
|
||
|
.L018square:
|
||
|
vmovdqa %xmm0,(%esp)
|
||
|
vmovdqa %xmm1,16(%esp)
|
||
|
vmovdqa %xmm2,32(%esp)
|
||
|
vmovdqa %xmm3,48(%esp)
|
||
|
vmovdqa %xmm4,64(%esp)
|
||
|
vpslld $2,%xmm1,%xmm6
|
||
|
vpslld $2,%xmm2,%xmm5
|
||
|
vpaddd %xmm1,%xmm6,%xmm6
|
||
|
vpaddd %xmm2,%xmm5,%xmm5
|
||
|
vmovdqa %xmm6,80(%esp)
|
||
|
vmovdqa %xmm5,96(%esp)
|
||
|
vpslld $2,%xmm3,%xmm6
|
||
|
vpslld $2,%xmm4,%xmm5
|
||
|
vpaddd %xmm3,%xmm6,%xmm6
|
||
|
vpaddd %xmm4,%xmm5,%xmm5
|
||
|
vmovdqa %xmm6,112(%esp)
|
||
|
vmovdqa %xmm5,128(%esp)
|
||
|
vpshufd $68,%xmm0,%xmm5
|
||
|
vmovdqa %xmm1,%xmm6
|
||
|
vpshufd $68,%xmm1,%xmm1
|
||
|
vpshufd $68,%xmm2,%xmm2
|
||
|
vpshufd $68,%xmm3,%xmm3
|
||
|
vpshufd $68,%xmm4,%xmm4
|
||
|
vmovdqa %xmm5,(%edx)
|
||
|
vmovdqa %xmm1,16(%edx)
|
||
|
vmovdqa %xmm2,32(%edx)
|
||
|
vmovdqa %xmm3,48(%edx)
|
||
|
vmovdqa %xmm4,64(%edx)
|
||
|
vpmuludq %xmm0,%xmm4,%xmm4
|
||
|
vpmuludq %xmm0,%xmm3,%xmm3
|
||
|
vpmuludq %xmm0,%xmm2,%xmm2
|
||
|
vpmuludq %xmm0,%xmm1,%xmm1
|
||
|
vpmuludq %xmm0,%xmm5,%xmm0
|
||
|
vpmuludq 48(%edx),%xmm6,%xmm5
|
||
|
vpaddq %xmm5,%xmm4,%xmm4
|
||
|
vpmuludq 32(%edx),%xmm6,%xmm7
|
||
|
vpaddq %xmm7,%xmm3,%xmm3
|
||
|
vpmuludq 16(%edx),%xmm6,%xmm5
|
||
|
vpaddq %xmm5,%xmm2,%xmm2
|
||
|
vmovdqa 80(%esp),%xmm7
|
||
|
vpmuludq (%edx),%xmm6,%xmm6
|
||
|
vpaddq %xmm6,%xmm1,%xmm1
|
||
|
vmovdqa 32(%esp),%xmm5
|
||
|
vpmuludq 64(%edx),%xmm7,%xmm7
|
||
|
vpaddq %xmm7,%xmm0,%xmm0
|
||
|
vpmuludq 32(%edx),%xmm5,%xmm6
|
||
|
vpaddq %xmm6,%xmm4,%xmm4
|
||
|
vpmuludq 16(%edx),%xmm5,%xmm7
|
||
|
vpaddq %xmm7,%xmm3,%xmm3
|
||
|
vmovdqa 96(%esp),%xmm6
|
||
|
vpmuludq (%edx),%xmm5,%xmm5
|
||
|
vpaddq %xmm5,%xmm2,%xmm2
|
||
|
vpmuludq 64(%edx),%xmm6,%xmm7
|
||
|
vpaddq %xmm7,%xmm1,%xmm1
|
||
|
vmovdqa 48(%esp),%xmm5
|
||
|
vpmuludq 48(%edx),%xmm6,%xmm6
|
||
|
vpaddq %xmm6,%xmm0,%xmm0
|
||
|
vpmuludq 16(%edx),%xmm5,%xmm7
|
||
|
vpaddq %xmm7,%xmm4,%xmm4
|
||
|
vmovdqa 112(%esp),%xmm6
|
||
|
vpmuludq (%edx),%xmm5,%xmm5
|
||
|
vpaddq %xmm5,%xmm3,%xmm3
|
||
|
vpmuludq 64(%edx),%xmm6,%xmm7
|
||
|
vpaddq %xmm7,%xmm2,%xmm2
|
||
|
vpmuludq 48(%edx),%xmm6,%xmm5
|
||
|
vpaddq %xmm5,%xmm1,%xmm1
|
||
|
vmovdqa 64(%esp),%xmm7
|
||
|
vpmuludq 32(%edx),%xmm6,%xmm6
|
||
|
vpaddq %xmm6,%xmm0,%xmm0
|
||
|
vmovdqa 128(%esp),%xmm5
|
||
|
vpmuludq (%edx),%xmm7,%xmm7
|
||
|
vpaddq %xmm7,%xmm4,%xmm4
|
||
|
vpmuludq 64(%edx),%xmm5,%xmm6
|
||
|
vpaddq %xmm6,%xmm3,%xmm3
|
||
|
vpmuludq 16(%edx),%xmm5,%xmm7
|
||
|
vpaddq %xmm7,%xmm0,%xmm0
|
||
|
vpmuludq 32(%edx),%xmm5,%xmm6
|
||
|
vpaddq %xmm6,%xmm1,%xmm1
|
||
|
vmovdqa 64(%ebx),%xmm7
|
||
|
vpmuludq 48(%edx),%xmm5,%xmm5
|
||
|
vpaddq %xmm5,%xmm2,%xmm2
|
||
|
vpsrlq $26,%xmm3,%xmm5
|
||
|
vpand %xmm7,%xmm3,%xmm3
|
||
|
vpsrlq $26,%xmm0,%xmm6
|
||
|
vpand %xmm7,%xmm0,%xmm0
|
||
|
vpaddq %xmm5,%xmm4,%xmm4
|
||
|
vpaddq %xmm6,%xmm1,%xmm1
|
||
|
vpsrlq $26,%xmm4,%xmm5
|
||
|
vpand %xmm7,%xmm4,%xmm4
|
||
|
vpsrlq $26,%xmm1,%xmm6
|
||
|
vpand %xmm7,%xmm1,%xmm1
|
||
|
vpaddq %xmm6,%xmm2,%xmm2
|
||
|
vpaddd %xmm5,%xmm0,%xmm0
|
||
|
vpsllq $2,%xmm5,%xmm5
|
||
|
vpsrlq $26,%xmm2,%xmm6
|
||
|
vpand %xmm7,%xmm2,%xmm2
|
||
|
vpaddd %xmm5,%xmm0,%xmm0
|
||
|
vpaddd %xmm6,%xmm3,%xmm3
|
||
|
vpsrlq $26,%xmm3,%xmm6
|
||
|
vpsrlq $26,%xmm0,%xmm5
|
||
|
vpand %xmm7,%xmm0,%xmm0
|
||
|
vpand %xmm7,%xmm3,%xmm3
|
||
|
vpaddd %xmm5,%xmm1,%xmm1
|
||
|
vpaddd %xmm6,%xmm4,%xmm4
|
||
|
decl %ecx
|
||
|
jz .L019square_break
|
||
|
vpunpcklqdq (%esp),%xmm0,%xmm0
|
||
|
vpunpcklqdq 16(%esp),%xmm1,%xmm1
|
||
|
vpunpcklqdq 32(%esp),%xmm2,%xmm2
|
||
|
vpunpcklqdq 48(%esp),%xmm3,%xmm3
|
||
|
vpunpcklqdq 64(%esp),%xmm4,%xmm4
|
||
|
jmp .L018square
|
||
|
.L019square_break:
|
||
|
vpsllq $32,%xmm0,%xmm0
|
||
|
vpsllq $32,%xmm1,%xmm1
|
||
|
vpsllq $32,%xmm2,%xmm2
|
||
|
vpsllq $32,%xmm3,%xmm3
|
||
|
vpsllq $32,%xmm4,%xmm4
|
||
|
vpor (%esp),%xmm0,%xmm0
|
||
|
vpor 16(%esp),%xmm1,%xmm1
|
||
|
vpor 32(%esp),%xmm2,%xmm2
|
||
|
vpor 48(%esp),%xmm3,%xmm3
|
||
|
vpor 64(%esp),%xmm4,%xmm4
|
||
|
vpshufd $141,%xmm0,%xmm0
|
||
|
vpshufd $141,%xmm1,%xmm1
|
||
|
vpshufd $141,%xmm2,%xmm2
|
||
|
vpshufd $141,%xmm3,%xmm3
|
||
|
vpshufd $141,%xmm4,%xmm4
|
||
|
vmovdqu %xmm0,(%edi)
|
||
|
vmovdqu %xmm1,16(%edi)
|
||
|
vmovdqu %xmm2,32(%edi)
|
||
|
vmovdqu %xmm3,48(%edi)
|
||
|
vmovdqu %xmm4,64(%edi)
|
||
|
vpslld $2,%xmm1,%xmm6
|
||
|
vpslld $2,%xmm2,%xmm5
|
||
|
vpaddd %xmm1,%xmm6,%xmm6
|
||
|
vpaddd %xmm2,%xmm5,%xmm5
|
||
|
vmovdqu %xmm6,80(%edi)
|
||
|
vmovdqu %xmm5,96(%edi)
|
||
|
vpslld $2,%xmm3,%xmm6
|
||
|
vpslld $2,%xmm4,%xmm5
|
||
|
vpaddd %xmm3,%xmm6,%xmm6
|
||
|
vpaddd %xmm4,%xmm5,%xmm5
|
||
|
vmovdqu %xmm6,112(%edi)
|
||
|
vmovdqu %xmm5,128(%edi)
|
||
|
movl %ebp,%esp
|
||
|
leal -48(%edi),%edi
|
||
|
ret
|
||
|
.size _poly1305_init_avx2,.-_poly1305_init_avx2
|
||
|
.align 32
|
||
|
.type _poly1305_blocks_avx2,@function
|
||
|
.align 16
|
||
|
_poly1305_blocks_avx2:
|
||
|
pushl %ebp
|
||
|
pushl %ebx
|
||
|
pushl %esi
|
||
|
pushl %edi
|
||
|
movl 20(%esp),%edi
|
||
|
movl 24(%esp),%esi
|
||
|
movl 28(%esp),%ecx
|
||
|
movl 20(%edi),%eax
|
||
|
andl $-16,%ecx
|
||
|
jz .L020nodata
|
||
|
cmpl $64,%ecx
|
||
|
jae .L021enter_avx2
|
||
|
testl %eax,%eax
|
||
|
jz .Lenter_blocks
|
||
|
.L021enter_avx2:
|
||
|
vzeroupper
|
||
|
call .L022pic_point
|
||
|
.L022pic_point:
|
||
|
popl %ebx
|
||
|
leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
|
||
|
testl %eax,%eax
|
||
|
jnz .L023base2_26
|
||
|
call _poly1305_init_avx2
|
||
|
movl (%edi),%eax
|
||
|
movl 3(%edi),%ecx
|
||
|
movl 6(%edi),%edx
|
||
|
movl 9(%edi),%esi
|
||
|
movl 13(%edi),%ebp
|
||
|
shrl $2,%ecx
|
||
|
andl $67108863,%eax
|
||
|
shrl $4,%edx
|
||
|
andl $67108863,%ecx
|
||
|
shrl $6,%esi
|
||
|
andl $67108863,%edx
|
||
|
movl %eax,(%edi)
|
||
|
movl %ecx,4(%edi)
|
||
|
movl %edx,8(%edi)
|
||
|
movl %esi,12(%edi)
|
||
|
movl %ebp,16(%edi)
|
||
|
movl $1,20(%edi)
|
||
|
movl 24(%esp),%esi
|
||
|
movl 28(%esp),%ecx
|
||
|
.L023base2_26:
|
||
|
movl 32(%esp),%eax
|
||
|
movl %esp,%ebp
|
||
|
subl $448,%esp
|
||
|
andl $-512,%esp
|
||
|
vmovdqu 48(%edi),%xmm0
|
||
|
leal 288(%esp),%edx
|
||
|
vmovdqu 64(%edi),%xmm1
|
||
|
vmovdqu 80(%edi),%xmm2
|
||
|
vmovdqu 96(%edi),%xmm3
|
||
|
vmovdqu 112(%edi),%xmm4
|
||
|
leal 48(%edi),%edi
|
||
|
vpermq $64,%ymm0,%ymm0
|
||
|
vpermq $64,%ymm1,%ymm1
|
||
|
vpermq $64,%ymm2,%ymm2
|
||
|
vpermq $64,%ymm3,%ymm3
|
||
|
vpermq $64,%ymm4,%ymm4
|
||
|
vpshufd $200,%ymm0,%ymm0
|
||
|
vpshufd $200,%ymm1,%ymm1
|
||
|
vpshufd $200,%ymm2,%ymm2
|
||
|
vpshufd $200,%ymm3,%ymm3
|
||
|
vpshufd $200,%ymm4,%ymm4
|
||
|
vmovdqa %ymm0,-128(%edx)
|
||
|
vmovdqu 80(%edi),%xmm0
|
||
|
vmovdqa %ymm1,-96(%edx)
|
||
|
vmovdqu 96(%edi),%xmm1
|
||
|
vmovdqa %ymm2,-64(%edx)
|
||
|
vmovdqu 112(%edi),%xmm2
|
||
|
vmovdqa %ymm3,-32(%edx)
|
||
|
vmovdqu 128(%edi),%xmm3
|
||
|
vmovdqa %ymm4,(%edx)
|
||
|
vpermq $64,%ymm0,%ymm0
|
||
|
vpermq $64,%ymm1,%ymm1
|
||
|
vpermq $64,%ymm2,%ymm2
|
||
|
vpermq $64,%ymm3,%ymm3
|
||
|
vpshufd $200,%ymm0,%ymm0
|
||
|
vpshufd $200,%ymm1,%ymm1
|
||
|
vpshufd $200,%ymm2,%ymm2
|
||
|
vpshufd $200,%ymm3,%ymm3
|
||
|
vmovdqa %ymm0,32(%edx)
|
||
|
vmovd -48(%edi),%xmm0
|
||
|
vmovdqa %ymm1,64(%edx)
|
||
|
vmovd -44(%edi),%xmm1
|
||
|
vmovdqa %ymm2,96(%edx)
|
||
|
vmovd -40(%edi),%xmm2
|
||
|
vmovdqa %ymm3,128(%edx)
|
||
|
vmovd -36(%edi),%xmm3
|
||
|
vmovd -32(%edi),%xmm4
|
||
|
vmovdqa 64(%ebx),%ymm7
|
||
|
negl %eax
|
||
|
testl $63,%ecx
|
||
|
jz .L024even
|
||
|
movl %ecx,%edx
|
||
|
andl $-64,%ecx
|
||
|
andl $63,%edx
|
||
|
vmovdqu (%esi),%xmm5
|
||
|
cmpl $32,%edx
|
||
|
jb .L025one
|
||
|
vmovdqu 16(%esi),%xmm6
|
||
|
je .L026two
|
||
|
vinserti128 $1,32(%esi),%ymm5,%ymm5
|
||
|
leal 48(%esi),%esi
|
||
|
leal 8(%ebx),%ebx
|
||
|
leal 296(%esp),%edx
|
||
|
jmp .L027tail
|
||
|
.L026two:
|
||
|
leal 32(%esi),%esi
|
||
|
leal 16(%ebx),%ebx
|
||
|
leal 304(%esp),%edx
|
||
|
jmp .L027tail
|
||
|
.L025one:
|
||
|
leal 16(%esi),%esi
|
||
|
vpxor %ymm6,%ymm6,%ymm6
|
||
|
leal 32(%ebx,%eax,8),%ebx
|
||
|
leal 312(%esp),%edx
|
||
|
jmp .L027tail
|
||
|
.align 32
|
||
|
.L024even:
|
||
|
vmovdqu (%esi),%xmm5
|
||
|
vmovdqu 16(%esi),%xmm6
|
||
|
vinserti128 $1,32(%esi),%ymm5,%ymm5
|
||
|
vinserti128 $1,48(%esi),%ymm6,%ymm6
|
||
|
leal 64(%esi),%esi
|
||
|
subl $64,%ecx
|
||
|
jz .L027tail
|
||
|
.L028loop:
|
||
|
vmovdqa %ymm2,64(%esp)
|
||
|
vpsrldq $6,%ymm5,%ymm2
|
||
|
vmovdqa %ymm0,(%esp)
|
||
|
vpsrldq $6,%ymm6,%ymm0
|
||
|
vmovdqa %ymm1,32(%esp)
|
||
|
vpunpckhqdq %ymm6,%ymm5,%ymm1
|
||
|
vpunpcklqdq %ymm6,%ymm5,%ymm5
|
||
|
vpunpcklqdq %ymm0,%ymm2,%ymm2
|
||
|
vpsrlq $30,%ymm2,%ymm0
|
||
|
vpsrlq $4,%ymm2,%ymm2
|
||
|
vpsrlq $26,%ymm5,%ymm6
|
||
|
vpsrlq $40,%ymm1,%ymm1
|
||
|
vpand %ymm7,%ymm2,%ymm2
|
||
|
vpand %ymm7,%ymm5,%ymm5
|
||
|
vpand %ymm7,%ymm6,%ymm6
|
||
|
vpand %ymm7,%ymm0,%ymm0
|
||
|
vpor (%ebx),%ymm1,%ymm1
|
||
|
vpaddq 64(%esp),%ymm2,%ymm2
|
||
|
vpaddq (%esp),%ymm5,%ymm5
|
||
|
vpaddq 32(%esp),%ymm6,%ymm6
|
||
|
vpaddq %ymm3,%ymm0,%ymm0
|
||
|
vpaddq %ymm4,%ymm1,%ymm1
|
||
|
vpmuludq -96(%edx),%ymm2,%ymm3
|
||
|
vmovdqa %ymm6,32(%esp)
|
||
|
vpmuludq -64(%edx),%ymm2,%ymm4
|
||
|
vmovdqa %ymm0,96(%esp)
|
||
|
vpmuludq 96(%edx),%ymm2,%ymm0
|
||
|
vmovdqa %ymm1,128(%esp)
|
||
|
vpmuludq 128(%edx),%ymm2,%ymm1
|
||
|
vpmuludq -128(%edx),%ymm2,%ymm2
|
||
|
vpmuludq -32(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm3,%ymm3
|
||
|
vpmuludq (%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm4,%ymm4
|
||
|
vpmuludq -128(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm0,%ymm0
|
||
|
vmovdqa 32(%esp),%ymm7
|
||
|
vpmuludq -96(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpmuludq -64(%edx),%ymm5,%ymm5
|
||
|
vpaddq %ymm5,%ymm2,%ymm2
|
||
|
vpmuludq -64(%edx),%ymm7,%ymm6
|
||
|
vpaddq %ymm6,%ymm3,%ymm3
|
||
|
vpmuludq -32(%edx),%ymm7,%ymm5
|
||
|
vpaddq %ymm5,%ymm4,%ymm4
|
||
|
vpmuludq 128(%edx),%ymm7,%ymm6
|
||
|
vpaddq %ymm6,%ymm0,%ymm0
|
||
|
vmovdqa 96(%esp),%ymm6
|
||
|
vpmuludq -128(%edx),%ymm7,%ymm5
|
||
|
vpaddq %ymm5,%ymm1,%ymm1
|
||
|
vpmuludq -96(%edx),%ymm7,%ymm7
|
||
|
vpaddq %ymm7,%ymm2,%ymm2
|
||
|
vpmuludq -128(%edx),%ymm6,%ymm5
|
||
|
vpaddq %ymm5,%ymm3,%ymm3
|
||
|
vpmuludq -96(%edx),%ymm6,%ymm7
|
||
|
vpaddq %ymm7,%ymm4,%ymm4
|
||
|
vpmuludq 64(%edx),%ymm6,%ymm5
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vmovdqa 128(%esp),%ymm5
|
||
|
vpmuludq 96(%edx),%ymm6,%ymm7
|
||
|
vpaddq %ymm7,%ymm1,%ymm1
|
||
|
vpmuludq 128(%edx),%ymm6,%ymm6
|
||
|
vpaddq %ymm6,%ymm2,%ymm2
|
||
|
vpmuludq 128(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm3,%ymm3
|
||
|
vpmuludq 32(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm0,%ymm0
|
||
|
vpmuludq -128(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm4,%ymm4
|
||
|
vmovdqa 64(%ebx),%ymm7
|
||
|
vpmuludq 64(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpmuludq 96(%edx),%ymm5,%ymm5
|
||
|
vpaddq %ymm5,%ymm2,%ymm2
|
||
|
vpsrlq $26,%ymm3,%ymm5
|
||
|
vpand %ymm7,%ymm3,%ymm3
|
||
|
vpsrlq $26,%ymm0,%ymm6
|
||
|
vpand %ymm7,%ymm0,%ymm0
|
||
|
vpaddq %ymm5,%ymm4,%ymm4
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpsrlq $26,%ymm4,%ymm5
|
||
|
vpand %ymm7,%ymm4,%ymm4
|
||
|
vpsrlq $26,%ymm1,%ymm6
|
||
|
vpand %ymm7,%ymm1,%ymm1
|
||
|
vpaddq %ymm6,%ymm2,%ymm2
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vpsllq $2,%ymm5,%ymm5
|
||
|
vpsrlq $26,%ymm2,%ymm6
|
||
|
vpand %ymm7,%ymm2,%ymm2
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vpaddq %ymm6,%ymm3,%ymm3
|
||
|
vpsrlq $26,%ymm3,%ymm6
|
||
|
vpsrlq $26,%ymm0,%ymm5
|
||
|
vpand %ymm7,%ymm0,%ymm0
|
||
|
vpand %ymm7,%ymm3,%ymm3
|
||
|
vpaddq %ymm5,%ymm1,%ymm1
|
||
|
vpaddq %ymm6,%ymm4,%ymm4
|
||
|
vmovdqu (%esi),%xmm5
|
||
|
vmovdqu 16(%esi),%xmm6
|
||
|
vinserti128 $1,32(%esi),%ymm5,%ymm5
|
||
|
vinserti128 $1,48(%esi),%ymm6,%ymm6
|
||
|
leal 64(%esi),%esi
|
||
|
subl $64,%ecx
|
||
|
jnz .L028loop
|
||
|
.L027tail:
|
||
|
vmovdqa %ymm2,64(%esp)
|
||
|
vpsrldq $6,%ymm5,%ymm2
|
||
|
vmovdqa %ymm0,(%esp)
|
||
|
vpsrldq $6,%ymm6,%ymm0
|
||
|
vmovdqa %ymm1,32(%esp)
|
||
|
vpunpckhqdq %ymm6,%ymm5,%ymm1
|
||
|
vpunpcklqdq %ymm6,%ymm5,%ymm5
|
||
|
vpunpcklqdq %ymm0,%ymm2,%ymm2
|
||
|
vpsrlq $30,%ymm2,%ymm0
|
||
|
vpsrlq $4,%ymm2,%ymm2
|
||
|
vpsrlq $26,%ymm5,%ymm6
|
||
|
vpsrlq $40,%ymm1,%ymm1
|
||
|
vpand %ymm7,%ymm2,%ymm2
|
||
|
vpand %ymm7,%ymm5,%ymm5
|
||
|
vpand %ymm7,%ymm6,%ymm6
|
||
|
vpand %ymm7,%ymm0,%ymm0
|
||
|
vpor (%ebx),%ymm1,%ymm1
|
||
|
andl $-64,%ebx
|
||
|
vpaddq 64(%esp),%ymm2,%ymm2
|
||
|
vpaddq (%esp),%ymm5,%ymm5
|
||
|
vpaddq 32(%esp),%ymm6,%ymm6
|
||
|
vpaddq %ymm3,%ymm0,%ymm0
|
||
|
vpaddq %ymm4,%ymm1,%ymm1
|
||
|
vpmuludq -92(%edx),%ymm2,%ymm3
|
||
|
vmovdqa %ymm6,32(%esp)
|
||
|
vpmuludq -60(%edx),%ymm2,%ymm4
|
||
|
vmovdqa %ymm0,96(%esp)
|
||
|
vpmuludq 100(%edx),%ymm2,%ymm0
|
||
|
vmovdqa %ymm1,128(%esp)
|
||
|
vpmuludq 132(%edx),%ymm2,%ymm1
|
||
|
vpmuludq -124(%edx),%ymm2,%ymm2
|
||
|
vpmuludq -28(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm3,%ymm3
|
||
|
vpmuludq 4(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm4,%ymm4
|
||
|
vpmuludq -124(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm0,%ymm0
|
||
|
vmovdqa 32(%esp),%ymm7
|
||
|
vpmuludq -92(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpmuludq -60(%edx),%ymm5,%ymm5
|
||
|
vpaddq %ymm5,%ymm2,%ymm2
|
||
|
vpmuludq -60(%edx),%ymm7,%ymm6
|
||
|
vpaddq %ymm6,%ymm3,%ymm3
|
||
|
vpmuludq -28(%edx),%ymm7,%ymm5
|
||
|
vpaddq %ymm5,%ymm4,%ymm4
|
||
|
vpmuludq 132(%edx),%ymm7,%ymm6
|
||
|
vpaddq %ymm6,%ymm0,%ymm0
|
||
|
vmovdqa 96(%esp),%ymm6
|
||
|
vpmuludq -124(%edx),%ymm7,%ymm5
|
||
|
vpaddq %ymm5,%ymm1,%ymm1
|
||
|
vpmuludq -92(%edx),%ymm7,%ymm7
|
||
|
vpaddq %ymm7,%ymm2,%ymm2
|
||
|
vpmuludq -124(%edx),%ymm6,%ymm5
|
||
|
vpaddq %ymm5,%ymm3,%ymm3
|
||
|
vpmuludq -92(%edx),%ymm6,%ymm7
|
||
|
vpaddq %ymm7,%ymm4,%ymm4
|
||
|
vpmuludq 68(%edx),%ymm6,%ymm5
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vmovdqa 128(%esp),%ymm5
|
||
|
vpmuludq 100(%edx),%ymm6,%ymm7
|
||
|
vpaddq %ymm7,%ymm1,%ymm1
|
||
|
vpmuludq 132(%edx),%ymm6,%ymm6
|
||
|
vpaddq %ymm6,%ymm2,%ymm2
|
||
|
vpmuludq 132(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm3,%ymm3
|
||
|
vpmuludq 36(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm0,%ymm0
|
||
|
vpmuludq -124(%edx),%ymm5,%ymm7
|
||
|
vpaddq %ymm7,%ymm4,%ymm4
|
||
|
vmovdqa 64(%ebx),%ymm7
|
||
|
vpmuludq 68(%edx),%ymm5,%ymm6
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpmuludq 100(%edx),%ymm5,%ymm5
|
||
|
vpaddq %ymm5,%ymm2,%ymm2
|
||
|
vpsrldq $8,%ymm4,%ymm5
|
||
|
vpsrldq $8,%ymm3,%ymm6
|
||
|
vpaddq %ymm5,%ymm4,%ymm4
|
||
|
vpsrldq $8,%ymm0,%ymm5
|
||
|
vpaddq %ymm6,%ymm3,%ymm3
|
||
|
vpsrldq $8,%ymm1,%ymm6
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vpsrldq $8,%ymm2,%ymm5
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpermq $2,%ymm4,%ymm6
|
||
|
vpaddq %ymm5,%ymm2,%ymm2
|
||
|
vpermq $2,%ymm3,%ymm5
|
||
|
vpaddq %ymm6,%ymm4,%ymm4
|
||
|
vpermq $2,%ymm0,%ymm6
|
||
|
vpaddq %ymm5,%ymm3,%ymm3
|
||
|
vpermq $2,%ymm1,%ymm5
|
||
|
vpaddq %ymm6,%ymm0,%ymm0
|
||
|
vpermq $2,%ymm2,%ymm6
|
||
|
vpaddq %ymm5,%ymm1,%ymm1
|
||
|
vpaddq %ymm6,%ymm2,%ymm2
|
||
|
vpsrlq $26,%ymm3,%ymm5
|
||
|
vpand %ymm7,%ymm3,%ymm3
|
||
|
vpsrlq $26,%ymm0,%ymm6
|
||
|
vpand %ymm7,%ymm0,%ymm0
|
||
|
vpaddq %ymm5,%ymm4,%ymm4
|
||
|
vpaddq %ymm6,%ymm1,%ymm1
|
||
|
vpsrlq $26,%ymm4,%ymm5
|
||
|
vpand %ymm7,%ymm4,%ymm4
|
||
|
vpsrlq $26,%ymm1,%ymm6
|
||
|
vpand %ymm7,%ymm1,%ymm1
|
||
|
vpaddq %ymm6,%ymm2,%ymm2
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vpsllq $2,%ymm5,%ymm5
|
||
|
vpsrlq $26,%ymm2,%ymm6
|
||
|
vpand %ymm7,%ymm2,%ymm2
|
||
|
vpaddq %ymm5,%ymm0,%ymm0
|
||
|
vpaddq %ymm6,%ymm3,%ymm3
|
||
|
vpsrlq $26,%ymm3,%ymm6
|
||
|
vpsrlq $26,%ymm0,%ymm5
|
||
|
vpand %ymm7,%ymm0,%ymm0
|
||
|
vpand %ymm7,%ymm3,%ymm3
|
||
|
vpaddq %ymm5,%ymm1,%ymm1
|
||
|
vpaddq %ymm6,%ymm4,%ymm4
|
||
|
cmpl $0,%ecx
|
||
|
je .L029done
|
||
|
vpshufd $252,%xmm0,%xmm0
|
||
|
leal 288(%esp),%edx
|
||
|
vpshufd $252,%xmm1,%xmm1
|
||
|
vpshufd $252,%xmm2,%xmm2
|
||
|
vpshufd $252,%xmm3,%xmm3
|
||
|
vpshufd $252,%xmm4,%xmm4
|
||
|
jmp .L024even
|
||
|
.align 16
|
||
|
.L029done:
|
||
|
vmovd %xmm0,-48(%edi)
|
||
|
vmovd %xmm1,-44(%edi)
|
||
|
vmovd %xmm2,-40(%edi)
|
||
|
vmovd %xmm3,-36(%edi)
|
||
|
vmovd %xmm4,-32(%edi)
|
||
|
vzeroupper
|
||
|
movl %ebp,%esp
|
||
|
.L020nodata:
|
||
|
popl %edi
|
||
|
popl %esi
|
||
|
popl %ebx
|
||
|
popl %ebp
|
||
|
ret
|
||
|
.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
|
||
|
.align 64
|
||
|
.Lconst_sse2:
|
||
|
.long 16777216,0,16777216,0,16777216,0,16777216,0
|
||
|
.long 0,0,0,0,0,0,0,0
|
||
|
.long 67108863,0,67108863,0,67108863,0,67108863,0
|
||
|
.long 268435455,268435452,268435452,268435452
|
||
|
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
|
||
|
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
|
||
|
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
|
||
|
.byte 114,103,62,0
|
||
|
.align 4
|
||
|
.comm OPENSSL_ia32cap_P,16,4
|
||
|
#endif
|