/*
 * f-stack/freebsd/crypto/openssl/i386/poly1305-x86.S
 * (repository viewer metadata: 3826 lines, 79 KiB, highlighted as "ArmAsm";
 *  the content is 32-bit x86 assembly in AT&T syntax)
 */

/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
#ifdef PIC
.text
.align 64
.globl poly1305_init
.type poly1305_init,@function
.align 16
poly1305_init:
.L_poly1305_init_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ebp
xorl %eax,%eax
movl %eax,(%edi)
movl %eax,4(%edi)
movl %eax,8(%edi)
movl %eax,12(%edi)
movl %eax,16(%edi)
movl %eax,20(%edi)
cmpl $0,%esi
je .L000nokey
call .L001pic_point
.L001pic_point:
popl %ebx
leal poly1305_blocks-.L001pic_point(%ebx),%eax
leal poly1305_emit-.L001pic_point(%ebx),%edx
leal OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
movl (%edi),%ecx
andl $83886080,%ecx
cmpl $83886080,%ecx
jne .L002no_sse2
leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
movl 8(%edi),%ecx
testl $32,%ecx
jz .L002no_sse2
leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
movl 20(%esp),%edi
movl %eax,(%ebp)
movl %edx,4(%ebp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edx
andl $268435455,%eax
andl $268435452,%ebx
andl $268435452,%ecx
andl $268435452,%edx
movl %eax,24(%edi)
movl %ebx,28(%edi)
movl %ecx,32(%edi)
movl %edx,36(%edi)
movl $1,%eax
.L000nokey:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks -- cdecl entry point, four stack arguments.
 *
 *   arg1 (20(%esp) after the four pushes) -> %edi : context block
 *   arg2 (24(%esp))                       -> %esi : input bytes
 *   arg3 (28(%esp))                       -> %ecx : byte count
 *   arg4 (32(%esp))                               : pad word, added to the
 *                                                   fifth accumulator word
 *                                                   for every block
 *
 * Context layout used here: words 0..4 (offsets 0..16) hold the accumulator
 * h0..h4 in base 2^32, offsets 24..36 hold the key words r0..r3.
 * For every complete 16-byte block: h = (h + block + pad*2^128) * r, followed
 * by a partial reduction that folds everything above bit 130 back in
 * multiplied by 5.
 *
 * .Lenter_blocks is also a jump target for the SIMD blocks routines in this
 * file; they arrive with the same four registers pushed and with
 * %edi/%esi/%ecx already loaded.
 *
 * Only whole 16-byte blocks are consumed: the byte count is rounded down to
 * a multiple of 16.  (The mask used to be -15, which kept bit 0 of the count;
 * with an odd count the end pointer was then never hit by the 16-byte stride
 * and the loop ran past the input.  -16 matches the SIMD entry points.)
 *
 * Frame after "subl $64,%esp":
 *     0(%esp),12(%esp),16(%esp)  h0+m0, h3+m3, h4+pad of the current block
 *    20..28(%esp)                product words d0..d2
 *    36..48(%esp)                r0..r3
 *    52..60(%esp)                s1..s3 where s_i = r_i + (r_i >> 2)
 *    84(%esp)                    arg1 (context)
 *    92(%esp)                    arg3 slot, overwritten with the end pointer
 *    96(%esp)                    arg4 (pad word)
 */
.globl poly1305_blocks
.type poly1305_blocks,@function
.align 16
poly1305_blocks:
.L_poly1305_blocks_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
.Lenter_blocks:
/* Round the byte count down to whole 16-byte blocks; nothing to do if zero. */
andl $-16,%ecx
jz .L003nodata
subl $64,%esp
movl 24(%edi),%eax
movl 28(%edi),%ebx
/* %ebp = input + length: the loop's end pointer, parked in the arg3 slot. */
leal (%esi,%ecx,1),%ebp
movl 32(%edi),%ecx
movl 36(%edi),%edx
movl %ebp,92(%esp)
movl %esi,%ebp
/* Spill r0..r3 and build s1..s3 = r_i + (r_i >> 2). */
movl %eax,36(%esp)
movl %ebx,%eax
shrl $2,%eax
movl %ebx,40(%esp)
addl %ebx,%eax
movl %ecx,%ebx
shrl $2,%ebx
movl %ecx,44(%esp)
addl %ecx,%ebx
movl %edx,%ecx
shrl $2,%ecx
movl %edx,48(%esp)
addl %edx,%ecx
movl %eax,52(%esp)
movl %ebx,56(%esp)
movl %ecx,60(%esp)
/* Load the accumulator: h0..h4 = %eax,%ebx,%ecx,%esi,%edi. */
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
movl 12(%edi),%esi
movl 16(%edi),%edi
jmp .L004loop
.align 32
.L004loop:
/* h += block (128-bit add with carry), h4 += pad word + carry. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
adcl 12(%ebp),%esi
leal 16(%ebp),%ebp
adcl 96(%esp),%edi
movl %eax,(%esp)
movl %esi,12(%esp)
/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi. */
mull 36(%esp)
movl %edi,16(%esp)
movl %eax,%edi
movl %ebx,%eax
movl %edx,%esi
mull 60(%esp)
addl %eax,%edi
movl %ecx,%eax
adcl %edx,%esi
mull 56(%esp)
addl %eax,%edi
movl 12(%esp),%eax
adcl %edx,%esi
mull 52(%esp)
addl %eax,%edi
movl (%esp),%eax
adcl %edx,%esi
/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1, in %edi:%esi. */
mull 40(%esp)
movl %edi,20(%esp)
xorl %edi,%edi
addl %eax,%esi
movl %ebx,%eax
adcl %edx,%edi
mull 36(%esp)
addl %eax,%esi
movl %ecx,%eax
adcl %edx,%edi
mull 60(%esp)
addl %eax,%esi
movl 12(%esp),%eax
adcl %edx,%edi
mull 56(%esp)
addl %eax,%esi
movl 16(%esp),%eax
adcl %edx,%edi
imull 52(%esp),%eax
addl %eax,%esi
movl (%esp),%eax
adcl $0,%edi
/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2, in %esi:%edi. */
mull 44(%esp)
movl %esi,24(%esp)
xorl %esi,%esi
addl %eax,%edi
movl %ebx,%eax
adcl %edx,%esi
mull 40(%esp)
addl %eax,%edi
movl %ecx,%eax
adcl %edx,%esi
mull 36(%esp)
addl %eax,%edi
movl 12(%esp),%eax
adcl %edx,%esi
mull 60(%esp)
addl %eax,%edi
movl 16(%esp),%eax
adcl %edx,%esi
imull 56(%esp),%eax
addl %eax,%edi
movl (%esp),%eax
adcl $0,%esi
/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, in %edi:%esi. */
mull 48(%esp)
movl %edi,28(%esp)
xorl %edi,%edi
addl %eax,%esi
movl %ebx,%eax
adcl %edx,%edi
mull 44(%esp)
addl %eax,%esi
movl %ecx,%eax
adcl %edx,%edi
mull 40(%esp)
addl %eax,%esi
movl 12(%esp),%eax
adcl %edx,%edi
mull 36(%esp)
addl %eax,%esi
movl 16(%esp),%ecx
adcl %edx,%edi
movl %ecx,%edx
imull 60(%esp),%ecx
addl %ecx,%esi
movl 20(%esp),%eax
adcl $0,%edi
/* Top word: h4*r0 + carry out of d3. */
imull 36(%esp),%edx
addl %edi,%edx
movl 24(%esp),%ebx
movl 28(%esp),%ecx
/*
 * Partial reduction: keep the low two bits of the top word as the new h4 and
 * add (top >> 2) * 5 back into the low word, rippling the carry upward.
 */
movl %edx,%edi
shrl $2,%edx
andl $3,%edi
leal (%edx,%edx,4),%edx
addl %edx,%eax
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%esi
adcl $0,%edi
cmpl 92(%esp),%ebp
jne .L004loop
/* Write the accumulator back through the saved context pointer (arg1). */
movl 84(%esp),%edx
addl $64,%esp
movl %eax,(%edx)
movl %ebx,4(%edx)
movl %ecx,8(%edx)
movl %esi,12(%edx)
movl %edi,16(%edx)
.L003nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit -- cdecl entry point, three stack arguments.
 *
 *   arg1 (20(%esp) after the four pushes) -> %ebp : context block; words 0..4
 *                                                   are read as h0..h4
 *   arg2 (24(%esp))                       -> %edi : 16-byte output buffer
 *   arg3 (28(%esp))                               : four 32-bit words that
 *                                                   are added to the result
 *
 * Computes h + 5 and keeps it instead of h when the sum carries into bit 2
 * of the top word, then adds the four arg3 words (128-bit add, final carry
 * dropped) and stores 16 bytes to the output buffer.
 *
 * .Lenter_emit is also a jump target for _poly1305_emit_sse2, which arrives
 * with the same four registers pushed and %ebp already holding arg1.
 */
.globl poly1305_emit
.type poly1305_emit,@function
.align 16
poly1305_emit:
.L_poly1305_emit_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%ebp
.Lenter_emit:
movl 24(%esp),%edi
movl (%ebp),%eax
movl 4(%ebp),%ebx
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
movl 16(%ebp),%esi
/* Candidate value h + 5, carry rippled through all five words. */
addl $5,%eax
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%edx
adcl $0,%esi
/*
 * %esi = -(top >> 2): zero when the top word is below 4, all ones when
 * (top >> 2) is 1 (assumes the top word stays below 8 -- TODO confirm).
 */
shrl $2,%esi
negl %esi
/* Park (h + 5) & mask in the output buffer ... */
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
movl %eax,(%edi)
movl %ebx,4(%edi)
movl %ecx,8(%edi)
movl %edx,12(%edi)
/* ... then merge in h & ~mask: a branch-free select between h and h + 5. */
notl %esi
movl (%ebp),%eax
movl 4(%ebp),%ebx
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
/* The context pointer is no longer needed; %ebp now points at arg3. */
movl 28(%esp),%ebp
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
orl (%edi),%eax
orl 4(%edi),%ebx
orl 8(%edi),%ecx
orl 12(%edi),%edx
/* Add the four arg3 words and write the final 16 bytes. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
adcl 12(%ebp),%edx
movl %eax,(%edi)
movl %ebx,4(%edi)
movl %ecx,8(%edi)
movl %edx,12(%edi)
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- file-local helper with a register-based contract
 * (not cdecl).
 *
 *   in : %edi = context block; the 16 key bytes are read from 24(%edi)
 *        %ebx = base of a constant table; the limb mask is read from
 *               64(%ebx) (the caller in this file points %ebx at
 *               .Lconst_sse2; the mask is presumably 2^26-1 -- the table is
 *               not visible here, confirm)
 *   out: 144 bytes written at context offsets 48..191: five 16-byte rows of
 *        limbs followed by four rows holding 5x the limbs of rows 1..4
 *        %edi is returned unchanged (advanced by 48, then rewound)
 *   clobbers: %ebp (holds the caller's %esp while a 16-byte aligned scratch
 *        frame is in use), %ecx, %edx, %xmm0-%xmm7, EFLAGS
 *
 * The multiply/reduce body at .L005square runs twice (%ecx counts 2 -> 0).
 * Going by the label it squares, so the table presumably holds successive
 * powers of the key -- not provable from this routine alone, confirm against
 * the generator script.
 */
.align 32
.type _poly1305_init_sse2,@function
.align 16
_poly1305_init_sse2:
movdqu 24(%edi),%xmm4
leal 48(%edi),%edi
/* Carve out an aligned scratch frame; %ebp remembers the original %esp. */
movl %esp,%ebp
subl $224,%esp
andl $-16,%esp
movq 64(%ebx),%xmm7
/*
 * Split the 128-bit key into five limbs at bit offsets 0, 26, 52, 78 and
 * 104: psrldq $6 drops 48 bits, so the following shifts by 4 and 30 land on
 * bits 52 and 78; psrldq $13 drops 104 bits.
 */
movdqa %xmm4,%xmm0
movdqa %xmm4,%xmm1
movdqa %xmm4,%xmm2
pand %xmm7,%xmm0
psrlq $26,%xmm1
psrldq $6,%xmm2
pand %xmm7,%xmm1
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
pand %xmm7,%xmm2
pand %xmm7,%xmm3
psrldq $13,%xmm4
leal 144(%esp),%edx
movl $2,%ecx
.L005square:
/* Save the current limbs at 0..64(%esp). */
movdqa %xmm0,(%esp)
movdqa %xmm1,16(%esp)
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
/* 5 * limb1..limb4, computed as (x << 2) + x, saved at 80..128(%esp). */
movdqa %xmm1,%xmm6
movdqa %xmm2,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm1,%xmm6
paddd %xmm2,%xmm5
movdqa %xmm6,80(%esp)
movdqa %xmm5,96(%esp)
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm3,%xmm6
paddd %xmm4,%xmm5
movdqa %xmm6,112(%esp)
movdqa %xmm5,128(%esp)
/* pshufd $68 copies the low qword into both halves; copies go to 0..64(%edx). */
pshufd $68,%xmm0,%xmm6
movdqa %xmm1,%xmm5
pshufd $68,%xmm1,%xmm1
pshufd $68,%xmm2,%xmm2
pshufd $68,%xmm3,%xmm3
pshufd $68,%xmm4,%xmm4
movdqa %xmm6,(%edx)
movdqa %xmm1,16(%edx)
movdqa %xmm2,32(%edx)
movdqa %xmm3,48(%edx)
movdqa %xmm4,64(%edx)
/*
 * 5x5 limb product: saved limbs and their 5x multiples at (%esp) times the
 * duplicated limbs at (%edx); 64-bit column sums collect in %xmm0..%xmm4.
 */
pmuludq %xmm0,%xmm4
pmuludq %xmm0,%xmm3
pmuludq %xmm0,%xmm2
pmuludq %xmm0,%xmm1
pmuludq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 48(%edx),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa 80(%esp),%xmm6
pmuludq (%edx),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%edx),%xmm6
movdqa 32(%esp),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa 96(%esp),%xmm7
pmuludq (%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%edx),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%edx),%xmm5
movdqa 48(%esp),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa 112(%esp),%xmm5
pmuludq (%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%edx),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%edx),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%edx),%xmm7
movdqa 64(%esp),%xmm5
paddq %xmm6,%xmm1
movdqa 128(%esp),%xmm6
pmuludq (%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%edx),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%edx),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/*
 * Carry propagation in 26-bit steps: 3->4, 0->1, 4->0 (the carry out of
 * limb 4 is added once and again shifted left by 2, i.e. times 5), 1->2,
 * 2->3, 0->1, 3->4.
 */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
decl %ecx
jz .L006square_break
/*
 * First pass only: put the limbs saved at (%esp) into the high qword next
 * to the fresh result and run the body once more on both lanes.
 */
punpcklqdq (%esp),%xmm0
punpcklqdq 16(%esp),%xmm1
punpcklqdq 32(%esp),%xmm2
punpcklqdq 48(%esp),%xmm3
punpcklqdq 64(%esp),%xmm4
jmp .L005square
.L006square_break:
/*
 * Interleave the second-pass results (moved to the odd dwords) with the
 * first-pass values still saved at (%esp); pshufd $141 then fixes the final
 * dword order of each row.
 */
psllq $32,%xmm0
psllq $32,%xmm1
psllq $32,%xmm2
psllq $32,%xmm3
psllq $32,%xmm4
por (%esp),%xmm0
por 16(%esp),%xmm1
por 32(%esp),%xmm2
por 48(%esp),%xmm3
por 64(%esp),%xmm4
pshufd $141,%xmm0,%xmm0
pshufd $141,%xmm1,%xmm1
pshufd $141,%xmm2,%xmm2
pshufd $141,%xmm3,%xmm3
pshufd $141,%xmm4,%xmm4
/* Rows 0..4 of the table: the limbs themselves. */
movdqu %xmm0,(%edi)
movdqu %xmm1,16(%edi)
movdqu %xmm2,32(%edi)
movdqu %xmm3,48(%edi)
movdqu %xmm4,64(%edi)
/* Rows 5..8: 5 * limb1..limb4. */
movdqa %xmm1,%xmm6
movdqa %xmm2,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm1,%xmm6
paddd %xmm2,%xmm5
movdqu %xmm6,80(%edi)
movdqu %xmm5,96(%edi)
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm3,%xmm6
paddd %xmm4,%xmm5
movdqu %xmm6,112(%edi)
movdqu %xmm5,128(%edi)
/* Drop the scratch frame and hand back the original context pointer. */
movl %ebp,%esp
leal -48(%edi),%edi
ret
.size _poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2 -- SSE2 blocks routine (cdecl, same four arguments
 * as poly1305_blocks; poly1305_init stores its address in the caller's
 * function table when the capability test passes).
 *
 *   arg1 (20(%esp) after the four pushes) -> %edi : context block
 *   arg2 (24(%esp))                       -> %esi : input bytes
 *   arg3 (28(%esp))                       -> %ecx : byte count
 *   arg4 (32(%esp))                               : pad word
 *
 * Context word 5 (offset 20) is a representation flag: 0 means the
 * accumulator words 0..4 are in base 2^32, non-zero means they are five
 * 26-bit limbs and the table at offset 48 has been built.
 *
 * Dispatch:
 *   - byte count rounds down to 0               -> return
 *   - fewer than 64 bytes and flag == 0         -> jump into the integer
 *                                                  loop at .Lenter_blocks
 *   - otherwise                                 -> SIMD path below; on first
 *                                                  use it builds the table,
 *                                                  converts the accumulator
 *                                                  and sets the flag
 *
 * Conditional branches in the SIMD path consume EFLAGS that were set many
 * instructions earlier (SSE instructions, lea, mov and cmov leave EFLAGS
 * alone), so the instruction order must not be disturbed.
 */
.align 32
.type _poly1305_blocks_sse2,@function
.align 16
_poly1305_blocks_sse2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
movl 20(%edi),%eax
andl $-16,%ecx
jz .L007nodata
cmpl $64,%ecx
jae .L008enter_sse2
testl %eax,%eax
jz .Lenter_blocks
.align 16
.L008enter_sse2:
/* PIC idiom, then %ebx = address of the constant table .Lconst_sse2. */
call .L009pic_point
.L009pic_point:
popl %ebx
leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
testl %eax,%eax
jnz .L010base2_26
/* First SIMD use: build the table at context offset 48. */
call _poly1305_init_sse2
/*
 * Convert the accumulator from base 2^32 to 26-bit limbs using overlapping
 * unaligned loads at byte offsets 0, 3, 6, 9 and 13 (bit offsets 0, 24, 48,
 * 72, 104) shifted right by 0, 2, 4, 6, 0.  67108863 = 2^26-1.
 */
movl (%edi),%eax
movl 3(%edi),%ecx
movl 6(%edi),%edx
movl 9(%edi),%esi
movl 13(%edi),%ebp
movl $1,20(%edi)
shrl $2,%ecx
andl $67108863,%eax
shrl $4,%edx
andl $67108863,%ecx
shrl $6,%esi
andl $67108863,%edx
movd %eax,%xmm0
movd %ecx,%xmm1
movd %edx,%xmm2
movd %esi,%xmm3
movd %ebp,%xmm4
/*
 * NOTE(review): the byte count is reloaded from the argument slot here
 * without the "andl $-16" applied at entry, so on this first-use path a
 * count that is not a multiple of 16 reaches the tests below unrounded.
 * Looks harmless only if callers always pass whole blocks -- confirm.
 */
movl 24(%esp),%esi
movl 28(%esp),%ecx
jmp .L011base2_32
.align 16
.L010base2_26:
/* Accumulator already in 26-bit limbs: load them and the limb mask. */
movd (%edi),%xmm0
movd 4(%edi),%xmm1
movd 8(%edi),%xmm2
movd 12(%edi),%xmm3
movd 16(%edi),%xmm4
movdqa 64(%ebx),%xmm7
.L011base2_32:
/*
 * Common set-up: %ebp keeps the caller's %esp, a 528-byte frame aligned to
 * 16 is carved out, %edi is advanced to the table, and %eax = pad word << 24
 * (bit 128 of a block is bit 24 of the limb that starts at bit 104).
 */
movl 32(%esp),%eax
movl %esp,%ebp
subl $528,%esp
andl $-16,%esp
leal 48(%edi),%edi
shll $24,%eax
/* If the count is not a multiple of 32, absorb a single block first. */
testl $31,%ecx
jz .L012even
/* Split 16 input bytes into five limbs and add them to the accumulator. */
movdqu (%esi),%xmm6
leal 16(%esi),%esi
movdqa %xmm6,%xmm5
pand %xmm7,%xmm6
paddd %xmm6,%xmm0
movdqa %xmm5,%xmm6
psrlq $26,%xmm5
psrldq $6,%xmm6
pand %xmm7,%xmm5
paddd %xmm5,%xmm1
movdqa %xmm6,%xmm5
psrlq $4,%xmm6
pand %xmm7,%xmm6
paddd %xmm6,%xmm2
movdqa %xmm5,%xmm6
psrlq $30,%xmm5
pand %xmm7,%xmm5
psrldq $7,%xmm6
paddd %xmm5,%xmm3
movd %eax,%xmm5
paddd %xmm6,%xmm4
movd 12(%edi),%xmm6
paddd %xmm5,%xmm4
movdqa %xmm0,(%esp)
movdqa %xmm1,16(%esp)
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
/*
 * Multiply by the multiplier whose limbs are dword 3 of table rows 0..4
 * (offsets 12, 28, 44, 60, 76) and whose 5x limbs are dword 3 of rows 5..8
 * (offsets 92, 108, 124, 140).
 */
pmuludq %xmm6,%xmm0
pmuludq %xmm6,%xmm1
pmuludq %xmm6,%xmm2
movd 28(%edi),%xmm5
pmuludq %xmm6,%xmm3
pmuludq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 48(%esp),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movd 92(%edi),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%esp),%xmm6
movd 44(%edi),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%esp),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%esp),%xmm5
paddq %xmm7,%xmm4
movd 108(%edi),%xmm7
pmuludq (%esp),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%esp),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%esp),%xmm5
movd 60(%edi),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%esp),%xmm6
paddq %xmm5,%xmm0
movd 124(%edi),%xmm5
pmuludq (%esp),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%esp),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%esp),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%esp),%xmm7
movd 76(%edi),%xmm5
paddq %xmm6,%xmm1
movd 140(%edi),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%esp),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%esp),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Carry propagation in 26-bit steps; the carry out of limb 4 re-enters limb 0 times 5. */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
subl $16,%ecx
jz .L013done
.L012even:
/*
 * Two blocks are processed side by side from here on.
 * %edx = %esp+384 addresses the expanded table.  "subl $64,%ecx" sets the
 * flags consumed by cmovb just below and by "jbe .L014skip_loop" at the end
 * of this section.  When fewer than 64 bytes remain, cmovb steps %esi back
 * by 32 so that the loads at 32(%esi)/48(%esi) below fetch the next 32
 * unread bytes instead of the pair after them.
 */
leal 384(%esp),%edx
leal -32(%esi),%eax
subl $64,%ecx
movdqu (%edi),%xmm5
pshufd $68,%xmm5,%xmm6
cmovbl %eax,%esi
/*
 * Expand the nine table rows: pshufd $68 duplicates dwords 0,1 (stored at
 * 0..128(%edx)), pshufd $238 duplicates dwords 2,3 (stored at
 * -144..-16(%edx)).
 */
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,(%edx)
leal 160(%esp),%eax
movdqu 16(%edi),%xmm6
movdqa %xmm5,-144(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,16(%edx)
movdqu 32(%edi),%xmm5
movdqa %xmm6,-128(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,32(%edx)
movdqu 48(%edi),%xmm6
movdqa %xmm5,-112(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,48(%edx)
movdqu 64(%edi),%xmm5
movdqa %xmm6,-96(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,64(%edx)
movdqu 80(%edi),%xmm6
movdqa %xmm5,-80(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,80(%edx)
movdqu 96(%edi),%xmm5
movdqa %xmm6,-64(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,96(%edx)
movdqu 112(%edi),%xmm6
movdqa %xmm5,-48(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,112(%edx)
movdqu 128(%edi),%xmm5
movdqa %xmm6,-32(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,128(%edx)
movdqa %xmm5,-16(%edx)
/*
 * Load two blocks and split them into limbs, one block per 64-bit lane.
 * The accumulator limbs are parked at 80..144(%esp) meanwhile.  The constant
 * at (%ebx) is OR-ed into the top limb (presumably the 2^128 pad bit, i.e.
 * 1 << 24 per lane -- table not visible here, confirm).
 */
movdqu 32(%esi),%xmm5
movdqu 48(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,112(%esp)
movdqa %xmm3,128(%esp)
movdqa %xmm4,144(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
movdqa %xmm0,80(%esp)
movdqa %xmm1,96(%esp)
/* Flags still come from "subl $64,%ecx" above: 64 bytes or fewer left -> tail. */
jbe .L014skip_loop
jmp .L015loop
.align 32
.L015loop:
/*
 * Main loop, 64 input bytes per iteration.
 * First half: multiply the pair of blocks split above by the table values
 * expanded at -144..-16(%edx).
 */
movdqa -144(%edx),%xmm7
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
movdqa %xmm5,%xmm1
pmuludq %xmm7,%xmm5
movdqa %xmm6,%xmm0
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
pmuludq %xmm7,%xmm3
pmuludq %xmm7,%xmm4
pmuludq -16(%edx),%xmm0
movdqa %xmm1,%xmm7
pmuludq -128(%edx),%xmm1
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq -112(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa %xmm5,%xmm6
pmuludq -96(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 16(%eax),%xmm7
pmuludq -80(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq -128(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq -112(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 32(%eax),%xmm7
pmuludq -96(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq -32(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq -16(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq -128(%edx),%xmm6
paddq %xmm5,%xmm1
movdqa 48(%eax),%xmm5
pmuludq -112(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq -48(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa %xmm6,%xmm7
pmuludq -32(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq -16(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa 64(%eax),%xmm6
pmuludq -128(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa %xmm6,%xmm7
pmuludq -16(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq -64(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq -48(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa 64(%ebx),%xmm7
pmuludq -32(%edx),%xmm6
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/*
 * Second half: fetch the other pair of blocks of this 64-byte group (at
 * -32/-16(%esi)), split it, add the parked accumulator limbs, and multiply
 * by the table values expanded at 0..128(%edx); partial sums from the first
 * half are kept at 16..64(%esp) and in %xmm0.
 */
movdqu -32(%esi),%xmm5
movdqu -16(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
/*
 * Loop bookkeeping: this "subl $64,%ecx" sets the flags used by cmovb just
 * below and by "ja .L015loop" at the bottom of the loop.
 */
leal -32(%esi),%eax
subl $64,%ecx
paddd 80(%esp),%xmm5
paddd 96(%esp),%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
cmovbl %eax,%esi
leal 160(%esp),%eax
movdqa (%edx),%xmm7
movdqa %xmm1,16(%esp)
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
movdqa %xmm5,%xmm1
pmuludq %xmm7,%xmm5
paddq %xmm0,%xmm5
movdqa %xmm6,%xmm0
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
pmuludq %xmm7,%xmm3
pmuludq %xmm7,%xmm4
paddq 16(%esp),%xmm6
paddq 32(%esp),%xmm2
paddq 48(%esp),%xmm3
paddq 64(%esp),%xmm4
pmuludq 128(%edx),%xmm0
movdqa %xmm1,%xmm7
pmuludq 16(%edx),%xmm1
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa %xmm5,%xmm6
pmuludq 48(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 16(%eax),%xmm7
pmuludq 64(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 32(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 32(%eax),%xmm7
pmuludq 48(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 112(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 128(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 16(%edx),%xmm6
paddq %xmm5,%xmm1
movdqa 48(%eax),%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 96(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa %xmm6,%xmm7
pmuludq 112(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq 128(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa 64(%eax),%xmm6
pmuludq 16(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa %xmm6,%xmm7
pmuludq 128(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 80(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 96(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa 64(%ebx),%xmm7
pmuludq 112(%edx),%xmm6
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Carry propagation back to (roughly) 26-bit limbs. */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
/* Pre-load and split the next pair of blocks; park the accumulator again. */
movdqu 32(%esi),%xmm5
movdqu 48(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,112(%esp)
movdqa %xmm3,128(%esp)
movdqa %xmm4,144(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
movdqa %xmm0,80(%esp)
movdqa %xmm1,96(%esp)
/* Flags from the "subl $64,%ecx" inside the loop: more than 64 bytes were left. */
ja .L015loop
.L014skip_loop:
/*
 * Tail.  pshufd $16 places dwords 0 and 1 of a row into the low dword of
 * each 64-bit lane, so the two lanes are multiplied by different table
 * entries.  "addl $32,%ecx" sets ZF, which is consumed here (jnz) and again
 * at "jz .L017short_tail" further down: zero means one pair of blocks is
 * left (short tail), non-zero means two pairs (long tail).
 */
pshufd $16,-144(%edx),%xmm7
addl $32,%ecx
jnz .L016long_tail
/* Short tail: the accumulator is folded into the last pair directly. */
paddd %xmm0,%xmm5
paddd %xmm1,%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
.L016long_tail:
movdqa %xmm5,(%eax)
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
pmuludq %xmm7,%xmm5
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
movdqa %xmm5,%xmm0
pshufd $16,-128(%edx),%xmm5
pmuludq %xmm7,%xmm3
movdqa %xmm6,%xmm1
pmuludq %xmm7,%xmm4
movdqa %xmm5,%xmm6
pmuludq 48(%eax),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%eax),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%eax),%xmm7
paddq %xmm6,%xmm3
pshufd $16,-64(%edx),%xmm6
pmuludq (%eax),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%eax),%xmm6
pshufd $16,-112(%edx),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%eax),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%eax),%xmm5
paddq %xmm7,%xmm4
pshufd $16,-48(%edx),%xmm7
pmuludq (%eax),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%eax),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%eax),%xmm5
pshufd $16,-96(%edx),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%eax),%xmm6
paddq %xmm5,%xmm0
pshufd $16,-32(%edx),%xmm5
pmuludq (%eax),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%eax),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%eax),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%eax),%xmm7
pshufd $16,-80(%edx),%xmm5
paddq %xmm6,%xmm1
pshufd $16,-16(%edx),%xmm6
pmuludq (%eax),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%eax),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%eax),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%eax),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%eax),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* ZF still comes from "addl $32,%ecx" at .L014skip_loop. */
jz .L017short_tail
/*
 * Long tail only: bring in the remaining pair of blocks (at -32/-16(%esi)),
 * add the parked accumulator, and multiply by the rows at 0..128(%edx),
 * accumulating onto the products above.
 */
movdqu -32(%esi),%xmm5
movdqu -16(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
pshufd $16,(%edx),%xmm7
paddd 80(%esp),%xmm5
paddd 96(%esp),%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
movdqa %xmm5,(%esp)
pmuludq %xmm7,%xmm5
movdqa %xmm6,16(%esp)
pmuludq %xmm7,%xmm6
paddq %xmm5,%xmm0
movdqa %xmm2,%xmm5
pmuludq %xmm7,%xmm2
paddq %xmm6,%xmm1
movdqa %xmm3,%xmm6
pmuludq %xmm7,%xmm3
paddq 32(%esp),%xmm2
movdqa %xmm5,32(%esp)
pshufd $16,16(%edx),%xmm5
paddq 48(%esp),%xmm3
movdqa %xmm6,48(%esp)
movdqa %xmm4,%xmm6
pmuludq %xmm7,%xmm4
paddq 64(%esp),%xmm4
movdqa %xmm6,64(%esp)
movdqa %xmm5,%xmm6
pmuludq 48(%esp),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
pshufd $16,80(%edx),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%esp),%xmm6
pshufd $16,32(%edx),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%esp),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%esp),%xmm5
paddq %xmm7,%xmm4
pshufd $16,96(%edx),%xmm7
pmuludq (%esp),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%esp),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%esp),%xmm5
pshufd $16,48(%edx),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%esp),%xmm6
paddq %xmm5,%xmm0
pshufd $16,112(%edx),%xmm5
pmuludq (%esp),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%esp),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%esp),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%esp),%xmm7
pshufd $16,64(%edx),%xmm5
paddq %xmm6,%xmm1
pshufd $16,128(%edx),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%esp),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%esp),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
.L017short_tail:
/*
 * Horizontal add: pshufd $78 swaps the two 64-bit lanes, so each limb
 * becomes the sum of both lanes.  Then one more carry propagation.
 */
pshufd $78,%xmm4,%xmm6
pshufd $78,%xmm3,%xmm5
paddq %xmm6,%xmm4
paddq %xmm5,%xmm3
pshufd $78,%xmm0,%xmm6
pshufd $78,%xmm1,%xmm5
paddq %xmm6,%xmm0
paddq %xmm5,%xmm1
pshufd $78,%xmm2,%xmm6
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm6,%xmm2
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
.L013done:
/* Store the five limbs to context words 0..4 (%edi points 48 bytes past the context start). */
movd %xmm0,-48(%edi)
movd %xmm1,-44(%edi)
movd %xmm2,-40(%edi)
movd %xmm3,-36(%edi)
movd %xmm4,-32(%edi)
movl %ebp,%esp
.L007nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2 -- emit routine paired with the SIMD blocks routines
 * (cdecl, same three arguments as poly1305_emit).
 *
 *   arg1 (20(%esp) after the four pushes) -> %ebp : context block
 *   arg2 (24(%esp))                       -> %edi : 16-byte output buffer
 *   arg3 (28(%esp))                               : four 32-bit words that
 *                                                   are added to the result
 *
 * If context word 5 (offset 20) is zero the accumulator is still in base
 * 2^32 and control transfers to .Lenter_emit inside poly1305_emit.
 * Otherwise the five 26-bit limbs are first packed back into base 2^32 and
 * partially reduced, after which the same "h or h + 5" selection and final
 * 128-bit addition as in poly1305_emit are performed.
 */
.align 32
.type _poly1305_emit_sse2,@function
.align 16
_poly1305_emit_sse2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%ebp
cmpl $0,20(%ebp)
je .Lenter_emit
/* Limbs l0..l4 -> %eax, %edi, %ecx, %edx, %esi. */
movl (%ebp),%eax
movl 4(%ebp),%edi
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
movl 16(%ebp),%esi
/*
 * Pack into 32-bit words with carries:
 *   w0  = l0 + (l1 << 26)
 *   w1  = (l1 >> 6)  + (l2 << 20)
 *   w2  = (l2 >> 12) + (l3 << 14)
 *   w3  = (l3 >> 18) + (l4 << 8)
 *   top = l4 >> 24
 */
movl %edi,%ebx
shll $26,%edi
shrl $6,%ebx
addl %edi,%eax
movl %ecx,%edi
adcl $0,%ebx
shll $20,%edi
shrl $12,%ecx
addl %edi,%ebx
movl %edx,%edi
adcl $0,%ecx
shll $14,%edi
shrl $18,%edx
addl %edi,%ecx
movl %esi,%edi
adcl $0,%edx
shll $8,%edi
shrl $24,%esi
addl %edi,%edx
adcl $0,%esi
/* Partial reduction: keep the low two bits of top, add (top >> 2) * 5 to the low word. */
movl %esi,%edi
andl $3,%esi
shrl $2,%edi
leal (%edi,%edi,4),%ebp
movl 24(%esp),%edi
addl %ebp,%eax
movl 28(%esp),%ebp
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%edx
adcl $0,%esi
/*
 * Keep a copy of h in %xmm0..%xmm3 while the integer registers compute
 * h + 5 (movd does not touch EFLAGS, so the adc chain is undisturbed).
 */
movd %eax,%xmm0
addl $5,%eax
movd %ebx,%xmm1
adcl $0,%ebx
movd %ecx,%xmm2
adcl $0,%ecx
movd %edx,%xmm3
adcl $0,%edx
adcl $0,%esi
/*
 * %esi = -(top >> 2): zero, or all ones when (top >> 2) is 1 (assumes the
 * top word stays below 8 -- TODO confirm).
 */
shrl $2,%esi
negl %esi
/* Park (h + 5) & mask in the output buffer while h is restored from the xmm copies. */
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
movl %eax,(%edi)
movd %xmm0,%eax
movl %ebx,4(%edi)
movd %xmm1,%ebx
movl %ecx,8(%edi)
movd %xmm2,%ecx
movl %edx,12(%edi)
movd %xmm3,%edx
/* Merge in h & ~mask: a branch-free select between h and h + 5. */
notl %esi
andl %esi,%eax
andl %esi,%ebx
orl (%edi),%eax
andl %esi,%ecx
orl 4(%edi),%ebx
andl %esi,%edx
orl 8(%edi),%ecx
orl 12(%edi),%edx
/* Add the four arg3 words (mov between the adc instructions leaves CF intact) and store. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
movl %eax,(%edi)
adcl 8(%ebp),%ecx
movl %ebx,4(%edi)
adcl 12(%ebp),%edx
movl %ecx,8(%edi)
movl %edx,12(%edi)
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- VEX-encoded counterpart of _poly1305_init_sse2;
 * file-local helper with a register-based contract (not cdecl).
 *
 *   in : %edi = context block; the 16 key bytes are read from 24(%edi)
 *        %ebx = base of a constant table; the limb mask is read from
 *               64(%ebx) (the caller in this file points %ebx at
 *               .Lconst_sse2; the mask is presumably 2^26-1 -- the table is
 *               not visible here, confirm)
 *   out: 144 bytes written at context offsets 48..191: five 16-byte rows of
 *        limbs followed by four rows holding 5x the limbs of rows 1..4
 *        %edi is returned unchanged (advanced by 48, then rewound)
 *   clobbers: %ebp (holds the caller's %esp while a 16-byte aligned scratch
 *        frame is in use), %ecx, %edx, %xmm0-%xmm7, EFLAGS
 *
 * Only 128-bit (xmm) VEX instructions are used here.  The multiply/reduce
 * body at .L018square runs twice (%ecx counts 2 -> 0).
 */
.align 32
.type _poly1305_init_avx2,@function
.align 16
_poly1305_init_avx2:
vmovdqu 24(%edi),%xmm4
leal 48(%edi),%edi
/* Carve out an aligned scratch frame; %ebp remembers the original %esp. */
movl %esp,%ebp
subl $224,%esp
andl $-16,%esp
vmovdqa 64(%ebx),%xmm7
/* Split the 128-bit key into five limbs at bit offsets 0, 26, 52, 78, 104. */
vpand %xmm7,%xmm4,%xmm0
vpsrlq $26,%xmm4,%xmm1
vpsrldq $6,%xmm4,%xmm3
vpand %xmm7,%xmm1,%xmm1
vpsrlq $4,%xmm3,%xmm2
vpsrlq $30,%xmm3,%xmm3
vpand %xmm7,%xmm2,%xmm2
vpand %xmm7,%xmm3,%xmm3
vpsrldq $13,%xmm4,%xmm4
leal 144(%esp),%edx
movl $2,%ecx
.L018square:
/* Save the current limbs at 0..64(%esp). */
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
vmovdqa %xmm4,64(%esp)
/* 5 * limb1..limb4, computed as (x << 2) + x, saved at 80..128(%esp). */
vpslld $2,%xmm1,%xmm6
vpslld $2,%xmm2,%xmm5
vpaddd %xmm1,%xmm6,%xmm6
vpaddd %xmm2,%xmm5,%xmm5
vmovdqa %xmm6,80(%esp)
vmovdqa %xmm5,96(%esp)
vpslld $2,%xmm3,%xmm6
vpslld $2,%xmm4,%xmm5
vpaddd %xmm3,%xmm6,%xmm6
vpaddd %xmm4,%xmm5,%xmm5
vmovdqa %xmm6,112(%esp)
vmovdqa %xmm5,128(%esp)
/* vpshufd $68 copies the low qword into both halves; copies go to 0..64(%edx). */
vpshufd $68,%xmm0,%xmm5
vmovdqa %xmm1,%xmm6
vpshufd $68,%xmm1,%xmm1
vpshufd $68,%xmm2,%xmm2
vpshufd $68,%xmm3,%xmm3
vpshufd $68,%xmm4,%xmm4
vmovdqa %xmm5,(%edx)
vmovdqa %xmm1,16(%edx)
vmovdqa %xmm2,32(%edx)
vmovdqa %xmm3,48(%edx)
vmovdqa %xmm4,64(%edx)
/*
 * 5x5 limb product: saved limbs and their 5x multiples at (%esp) times the
 * duplicated limbs at (%edx); 64-bit column sums collect in %xmm0..%xmm4.
 */
vpmuludq %xmm0,%xmm4,%xmm4
vpmuludq %xmm0,%xmm3,%xmm3
vpmuludq %xmm0,%xmm2,%xmm2
vpmuludq %xmm0,%xmm1,%xmm1
vpmuludq %xmm0,%xmm5,%xmm0
vpmuludq 48(%edx),%xmm6,%xmm5
vpaddq %xmm5,%xmm4,%xmm4
vpmuludq 32(%edx),%xmm6,%xmm7
vpaddq %xmm7,%xmm3,%xmm3
vpmuludq 16(%edx),%xmm6,%xmm5
vpaddq %xmm5,%xmm2,%xmm2
vmovdqa 80(%esp),%xmm7
vpmuludq (%edx),%xmm6,%xmm6
vpaddq %xmm6,%xmm1,%xmm1
vmovdqa 32(%esp),%xmm5
vpmuludq 64(%edx),%xmm7,%xmm7
vpaddq %xmm7,%xmm0,%xmm0
vpmuludq 32(%edx),%xmm5,%xmm6
vpaddq %xmm6,%xmm4,%xmm4
vpmuludq 16(%edx),%xmm5,%xmm7
vpaddq %xmm7,%xmm3,%xmm3
vmovdqa 96(%esp),%xmm6
vpmuludq (%edx),%xmm5,%xmm5
vpaddq %xmm5,%xmm2,%xmm2
vpmuludq 64(%edx),%xmm6,%xmm7
vpaddq %xmm7,%xmm1,%xmm1
vmovdqa 48(%esp),%xmm5
vpmuludq 48(%edx),%xmm6,%xmm6
vpaddq %xmm6,%xmm0,%xmm0
vpmuludq 16(%edx),%xmm5,%xmm7
vpaddq %xmm7,%xmm4,%xmm4
vmovdqa 112(%esp),%xmm6
vpmuludq (%edx),%xmm5,%xmm5
vpaddq %xmm5,%xmm3,%xmm3
vpmuludq 64(%edx),%xmm6,%xmm7
vpaddq %xmm7,%xmm2,%xmm2
vpmuludq 48(%edx),%xmm6,%xmm5
vpaddq %xmm5,%xmm1,%xmm1
vmovdqa 64(%esp),%xmm7
vpmuludq 32(%edx),%xmm6,%xmm6
vpaddq %xmm6,%xmm0,%xmm0
vmovdqa 128(%esp),%xmm5
vpmuludq (%edx),%xmm7,%xmm7
vpaddq %xmm7,%xmm4,%xmm4
vpmuludq 64(%edx),%xmm5,%xmm6
vpaddq %xmm6,%xmm3,%xmm3
vpmuludq 16(%edx),%xmm5,%xmm7
vpaddq %xmm7,%xmm0,%xmm0
vpmuludq 32(%edx),%xmm5,%xmm6
vpaddq %xmm6,%xmm1,%xmm1
vmovdqa 64(%ebx),%xmm7
vpmuludq 48(%edx),%xmm5,%xmm5
vpaddq %xmm5,%xmm2,%xmm2
/*
 * Carry propagation in 26-bit steps; the carry out of limb 4 is added to
 * limb 0 once and again shifted left by 2 (times 5 in total).
 */
vpsrlq $26,%xmm3,%xmm5
vpand %xmm7,%xmm3,%xmm3
vpsrlq $26,%xmm0,%xmm6
vpand %xmm7,%xmm0,%xmm0
vpaddq %xmm5,%xmm4,%xmm4
vpaddq %xmm6,%xmm1,%xmm1
vpsrlq $26,%xmm4,%xmm5
vpand %xmm7,%xmm4,%xmm4
vpsrlq $26,%xmm1,%xmm6
vpand %xmm7,%xmm1,%xmm1
vpaddq %xmm6,%xmm2,%xmm2
vpaddd %xmm5,%xmm0,%xmm0
vpsllq $2,%xmm5,%xmm5
vpsrlq $26,%xmm2,%xmm6
vpand %xmm7,%xmm2,%xmm2
vpaddd %xmm5,%xmm0,%xmm0
vpaddd %xmm6,%xmm3,%xmm3
vpsrlq $26,%xmm3,%xmm6
vpsrlq $26,%xmm0,%xmm5
vpand %xmm7,%xmm0,%xmm0
vpand %xmm7,%xmm3,%xmm3
vpaddd %xmm5,%xmm1,%xmm1
vpaddd %xmm6,%xmm4,%xmm4
decl %ecx
jz .L019square_break
/*
 * First pass only: put the limbs saved at (%esp) into the high qword next
 * to the fresh result and run the body once more on both lanes.
 */
vpunpcklqdq (%esp),%xmm0,%xmm0
vpunpcklqdq 16(%esp),%xmm1,%xmm1
vpunpcklqdq 32(%esp),%xmm2,%xmm2
vpunpcklqdq 48(%esp),%xmm3,%xmm3
vpunpcklqdq 64(%esp),%xmm4,%xmm4
jmp .L018square
.L019square_break:
/*
 * Interleave the second-pass results (moved to the odd dwords) with the
 * first-pass values still saved at (%esp); vpshufd $141 then fixes the
 * final dword order of each row.
 */
vpsllq $32,%xmm0,%xmm0
vpsllq $32,%xmm1,%xmm1
vpsllq $32,%xmm2,%xmm2
vpsllq $32,%xmm3,%xmm3
vpsllq $32,%xmm4,%xmm4
vpor (%esp),%xmm0,%xmm0
vpor 16(%esp),%xmm1,%xmm1
vpor 32(%esp),%xmm2,%xmm2
vpor 48(%esp),%xmm3,%xmm3
vpor 64(%esp),%xmm4,%xmm4
vpshufd $141,%xmm0,%xmm0
vpshufd $141,%xmm1,%xmm1
vpshufd $141,%xmm2,%xmm2
vpshufd $141,%xmm3,%xmm3
vpshufd $141,%xmm4,%xmm4
/* Rows 0..4 of the table: the limbs themselves. */
vmovdqu %xmm0,(%edi)
vmovdqu %xmm1,16(%edi)
vmovdqu %xmm2,32(%edi)
vmovdqu %xmm3,48(%edi)
vmovdqu %xmm4,64(%edi)
/* Rows 5..8: 5 * limb1..limb4. */
vpslld $2,%xmm1,%xmm6
vpslld $2,%xmm2,%xmm5
vpaddd %xmm1,%xmm6,%xmm6
vpaddd %xmm2,%xmm5,%xmm5
vmovdqu %xmm6,80(%edi)
vmovdqu %xmm5,96(%edi)
vpslld $2,%xmm3,%xmm6
vpslld $2,%xmm4,%xmm5
vpaddd %xmm3,%xmm6,%xmm6
vpaddd %xmm4,%xmm5,%xmm5
vmovdqu %xmm6,112(%edi)
vmovdqu %xmm5,128(%edi)
/* Drop the scratch frame and hand back the original context pointer. */
movl %ebp,%esp
leal -48(%edi),%edi
ret
.size _poly1305_init_avx2,.-_poly1305_init_avx2
.align 32
.type _poly1305_blocks_avx2,@function
.align 16
_poly1305_blocks_avx2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
movl 20(%edi),%eax
andl $-16,%ecx
jz .L020nodata
cmpl $64,%ecx
jae .L021enter_avx2
testl %eax,%eax
jz .Lenter_blocks
.L021enter_avx2:
vzeroupper
call .L022pic_point
.L022pic_point:
popl %ebx
leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
testl %eax,%eax
jnz .L023base2_26
call _poly1305_init_avx2
movl (%edi),%eax
movl 3(%edi),%ecx
movl 6(%edi),%edx
movl 9(%edi),%esi
movl 13(%edi),%ebp
shrl $2,%ecx
andl $67108863,%eax
shrl $4,%edx
andl $67108863,%ecx
shrl $6,%esi
andl $67108863,%edx
movl %eax,(%edi)
movl %ecx,4(%edi)
movl %edx,8(%edi)
movl %esi,12(%edi)
movl %ebp,16(%edi)
movl $1,20(%edi)
movl 24(%esp),%esi
movl 28(%esp),%ecx
.L023base2_26:
movl 32(%esp),%eax
movl %esp,%ebp
subl $448,%esp
andl $-512,%esp
vmovdqu 48(%edi),%xmm0
leal 288(%esp),%edx
vmovdqu 64(%edi),%xmm1
vmovdqu 80(%edi),%xmm2
vmovdqu 96(%edi),%xmm3
vmovdqu 112(%edi),%xmm4
leal 48(%edi),%edi
vpermq $64,%ymm0,%ymm0
vpermq $64,%ymm1,%ymm1
vpermq $64,%ymm2,%ymm2
vpermq $64,%ymm3,%ymm3
vpermq $64,%ymm4,%ymm4
vpshufd $200,%ymm0,%ymm0
vpshufd $200,%ymm1,%ymm1
vpshufd $200,%ymm2,%ymm2
vpshufd $200,%ymm3,%ymm3
vpshufd $200,%ymm4,%ymm4
vmovdqa %ymm0,-128(%edx)
vmovdqu 80(%edi),%xmm0
vmovdqa %ymm1,-96(%edx)
vmovdqu 96(%edi),%xmm1
vmovdqa %ymm2,-64(%edx)
vmovdqu 112(%edi),%xmm2
vmovdqa %ymm3,-32(%edx)
vmovdqu 128(%edi),%xmm3
vmovdqa %ymm4,(%edx)
vpermq $64,%ymm0,%ymm0
vpermq $64,%ymm1,%ymm1
vpermq $64,%ymm2,%ymm2
vpermq $64,%ymm3,%ymm3
vpshufd $200,%ymm0,%ymm0
vpshufd $200,%ymm1,%ymm1
vpshufd $200,%ymm2,%ymm2
vpshufd $200,%ymm3,%ymm3
vmovdqa %ymm0,32(%edx)
vmovd -48(%edi),%xmm0
vmovdqa %ymm1,64(%edx)
vmovd -44(%edi),%xmm1
vmovdqa %ymm2,96(%edx)
vmovd -40(%edi),%xmm2
vmovdqa %ymm3,128(%edx)
vmovd -36(%edi),%xmm3
vmovd -32(%edi),%xmm4
vmovdqa 64(%ebx),%ymm7
negl %eax
testl $63,%ecx
jz .L024even
movl %ecx,%edx
andl $-64,%ecx
andl $63,%edx
vmovdqu (%esi),%xmm5
cmpl $32,%edx
jb .L025one
vmovdqu 16(%esi),%xmm6
je .L026two
vinserti128 $1,32(%esi),%ymm5,%ymm5
leal 48(%esi),%esi
leal 8(%ebx),%ebx
leal 296(%esp),%edx
jmp .L027tail
.L026two:
leal 32(%esi),%esi
leal 16(%ebx),%ebx
leal 304(%esp),%edx
jmp .L027tail
.L025one:
leal 16(%esi),%esi
vpxor %ymm6,%ymm6,%ymm6
leal 32(%ebx,%eax,8),%ebx
leal 312(%esp),%edx
jmp .L027tail
.align 32
.L024even:
vmovdqu (%esi),%xmm5
vmovdqu 16(%esi),%xmm6
vinserti128 $1,32(%esi),%ymm5,%ymm5
vinserti128 $1,48(%esi),%ymm6,%ymm6
leal 64(%esi),%esi
subl $64,%ecx
jz .L027tail
.L028loop:
vmovdqa %ymm2,64(%esp)
vpsrldq $6,%ymm5,%ymm2
vmovdqa %ymm0,(%esp)
vpsrldq $6,%ymm6,%ymm0
vmovdqa %ymm1,32(%esp)
vpunpckhqdq %ymm6,%ymm5,%ymm1
vpunpcklqdq %ymm6,%ymm5,%ymm5
vpunpcklqdq %ymm0,%ymm2,%ymm2
vpsrlq $30,%ymm2,%ymm0
vpsrlq $4,%ymm2,%ymm2
vpsrlq $26,%ymm5,%ymm6
vpsrlq $40,%ymm1,%ymm1
vpand %ymm7,%ymm2,%ymm2
vpand %ymm7,%ymm5,%ymm5
vpand %ymm7,%ymm6,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpor (%ebx),%ymm1,%ymm1
vpaddq 64(%esp),%ymm2,%ymm2
vpaddq (%esp),%ymm5,%ymm5
vpaddq 32(%esp),%ymm6,%ymm6
vpaddq %ymm3,%ymm0,%ymm0
vpaddq %ymm4,%ymm1,%ymm1
vpmuludq -96(%edx),%ymm2,%ymm3
vmovdqa %ymm6,32(%esp)
vpmuludq -64(%edx),%ymm2,%ymm4
vmovdqa %ymm0,96(%esp)
vpmuludq 96(%edx),%ymm2,%ymm0
vmovdqa %ymm1,128(%esp)
vpmuludq 128(%edx),%ymm2,%ymm1
vpmuludq -128(%edx),%ymm2,%ymm2
vpmuludq -32(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq (%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm4,%ymm4
vpmuludq -128(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm0,%ymm0
vmovdqa 32(%esp),%ymm7
vpmuludq -96(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq -64(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
vpmuludq -64(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm3,%ymm3
vpmuludq -32(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm4,%ymm4
vpmuludq 128(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vmovdqa 96(%esp),%ymm6
vpmuludq -128(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm1,%ymm1
vpmuludq -96(%edx),%ymm7,%ymm7
vpaddq %ymm7,%ymm2,%ymm2
vpmuludq -128(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm3,%ymm3
vpmuludq -96(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vpmuludq 64(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm0,%ymm0
vmovdqa 128(%esp),%ymm5
vpmuludq 96(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm1,%ymm1
vpmuludq 128(%edx),%ymm6,%ymm6
vpaddq %ymm6,%ymm2,%ymm2
vpmuludq 128(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq 32(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vpmuludq -128(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vmovdqa 64(%ebx),%ymm7
vpmuludq 64(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq 96(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
vpsrlq $26,%ymm3,%ymm5
vpand %ymm7,%ymm3,%ymm3
vpsrlq $26,%ymm0,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpaddq %ymm5,%ymm4,%ymm4
vpaddq %ymm6,%ymm1,%ymm1
vpsrlq $26,%ymm4,%ymm5
vpand %ymm7,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm6
vpand %ymm7,%ymm1,%ymm1
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpsllq $2,%ymm5,%ymm5
vpsrlq $26,%ymm2,%ymm6
vpand %ymm7,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpaddq %ymm6,%ymm3,%ymm3
vpsrlq $26,%ymm3,%ymm6
vpsrlq $26,%ymm0,%ymm5
vpand %ymm7,%ymm0,%ymm0
vpand %ymm7,%ymm3,%ymm3
vpaddq %ymm5,%ymm1,%ymm1
vpaddq %ymm6,%ymm4,%ymm4
vmovdqu (%esi),%xmm5
vmovdqu 16(%esi),%xmm6
vinserti128 $1,32(%esi),%ymm5,%ymm5
vinserti128 $1,48(%esi),%ymm6,%ymm6
leal 64(%esi),%esi
subl $64,%ecx
jnz .L028loop
.L027tail:
vmovdqa %ymm2,64(%esp)
vpsrldq $6,%ymm5,%ymm2
vmovdqa %ymm0,(%esp)
vpsrldq $6,%ymm6,%ymm0
vmovdqa %ymm1,32(%esp)
vpunpckhqdq %ymm6,%ymm5,%ymm1
vpunpcklqdq %ymm6,%ymm5,%ymm5
vpunpcklqdq %ymm0,%ymm2,%ymm2
vpsrlq $30,%ymm2,%ymm0
vpsrlq $4,%ymm2,%ymm2
vpsrlq $26,%ymm5,%ymm6
vpsrlq $40,%ymm1,%ymm1
vpand %ymm7,%ymm2,%ymm2
vpand %ymm7,%ymm5,%ymm5
vpand %ymm7,%ymm6,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpor (%ebx),%ymm1,%ymm1
andl $-64,%ebx
vpaddq 64(%esp),%ymm2,%ymm2
vpaddq (%esp),%ymm5,%ymm5
vpaddq 32(%esp),%ymm6,%ymm6
vpaddq %ymm3,%ymm0,%ymm0
vpaddq %ymm4,%ymm1,%ymm1
vpmuludq -92(%edx),%ymm2,%ymm3
vmovdqa %ymm6,32(%esp)
vpmuludq -60(%edx),%ymm2,%ymm4
vmovdqa %ymm0,96(%esp)
vpmuludq 100(%edx),%ymm2,%ymm0
vmovdqa %ymm1,128(%esp)
vpmuludq 132(%edx),%ymm2,%ymm1
vpmuludq -124(%edx),%ymm2,%ymm2
vpmuludq -28(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq 4(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm4,%ymm4
vpmuludq -124(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm0,%ymm0
vmovdqa 32(%esp),%ymm7
vpmuludq -92(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq -60(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
vpmuludq -60(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm3,%ymm3
vpmuludq -28(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm4,%ymm4
vpmuludq 132(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vmovdqa 96(%esp),%ymm6
vpmuludq -124(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm1,%ymm1
vpmuludq -92(%edx),%ymm7,%ymm7
vpaddq %ymm7,%ymm2,%ymm2
vpmuludq -124(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm3,%ymm3
vpmuludq -92(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vpmuludq 68(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm0,%ymm0
vmovdqa 128(%esp),%ymm5
vpmuludq 100(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm1,%ymm1
vpmuludq 132(%edx),%ymm6,%ymm6
vpaddq %ymm6,%ymm2,%ymm2
vpmuludq 132(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq 36(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vpmuludq -124(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vmovdqa 64(%ebx),%ymm7
vpmuludq 68(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq 100(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
vpsrldq $8,%ymm4,%ymm5
vpsrldq $8,%ymm3,%ymm6
vpaddq %ymm5,%ymm4,%ymm4
vpsrldq $8,%ymm0,%ymm5
vpaddq %ymm6,%ymm3,%ymm3
vpsrldq $8,%ymm1,%ymm6
vpaddq %ymm5,%ymm0,%ymm0
vpsrldq $8,%ymm2,%ymm5
vpaddq %ymm6,%ymm1,%ymm1
vpermq $2,%ymm4,%ymm6
vpaddq %ymm5,%ymm2,%ymm2
vpermq $2,%ymm3,%ymm5
vpaddq %ymm6,%ymm4,%ymm4
vpermq $2,%ymm0,%ymm6
vpaddq %ymm5,%ymm3,%ymm3
vpermq $2,%ymm1,%ymm5
vpaddq %ymm6,%ymm0,%ymm0
vpermq $2,%ymm2,%ymm6
vpaddq %ymm5,%ymm1,%ymm1
vpaddq %ymm6,%ymm2,%ymm2
vpsrlq $26,%ymm3,%ymm5
vpand %ymm7,%ymm3,%ymm3
vpsrlq $26,%ymm0,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpaddq %ymm5,%ymm4,%ymm4
vpaddq %ymm6,%ymm1,%ymm1
vpsrlq $26,%ymm4,%ymm5
vpand %ymm7,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm6
vpand %ymm7,%ymm1,%ymm1
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpsllq $2,%ymm5,%ymm5
vpsrlq $26,%ymm2,%ymm6
vpand %ymm7,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpaddq %ymm6,%ymm3,%ymm3
vpsrlq $26,%ymm3,%ymm6
vpsrlq $26,%ymm0,%ymm5
vpand %ymm7,%ymm0,%ymm0
vpand %ymm7,%ymm3,%ymm3
vpaddq %ymm5,%ymm1,%ymm1
vpaddq %ymm6,%ymm4,%ymm4
cmpl $0,%ecx
je .L029done
vpshufd $252,%xmm0,%xmm0
leal 288(%esp),%edx
vpshufd $252,%xmm1,%xmm1
vpshufd $252,%xmm2,%xmm2
vpshufd $252,%xmm3,%xmm3
vpshufd $252,%xmm4,%xmm4
jmp .L024even
.align 16
.L029done:
vmovd %xmm0,-48(%edi)
vmovd %xmm1,-44(%edi)
vmovd %xmm2,-40(%edi)
vmovd %xmm3,-36(%edi)
vmovd %xmm4,-32(%edi)
vzeroupper
movl %ebp,%esp
.L020nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant pool for the vector code paths. Callers load its address into
 * %ebx and index it by byte offset:
 *   +0   16777216 (1<<24) in the low dword of every 64-bit lane; it is ORed
 *        into the top limb of freshly loaded input ("por/vpor (%ebx)").
 *   +32  an all-zero row; the AVX2 tail code visible above moves %ebx
 *        forward by 8/16/... bytes so that some lanes of the same OR pick
 *        up zeros from this row instead of 1<<24.
 *   +64  67108863 (2^26-1) in every 64-bit lane; read as 64(%ebx) and used
 *        as the limb mask.
 *   +96  268435455,268435452,268435452,268435452 - the same values
 *        poly1305_init uses to mask the key words. NOTE(review): no reader
 *        of this row is visible nearby; confirm it is still referenced.
 */
.align 64
.Lconst_sse2:
.long 16777216,0,16777216,0,16777216,0,16777216,0
.long 0,0,0,0,0,0,0,0
.long 67108863,0,67108863,0,67108863,0,67108863,0
.long 268435455,268435452,268435452,268435452
/* NUL-terminated ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.align 4
/* 16-byte, 4-byte-aligned common symbol; poly1305_init reads dwords 0 and 2 of it. */
.comm OPENSSL_ia32cap_P,16,4
#else
.text
.align	64
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
/*
 * poly1305_init (non-PIC build) - stack arguments, read after four pushes:
 *   20(%esp)  context pointer   (first argument)
 *   24(%esp)  key pointer       (second argument, may be NULL)
 *   28(%esp)  pointer to a two-entry function table (third argument)
 *
 * Zeroes context bytes 0..23.  With a NULL key it stops there and
 * returns 0 in %eax.  Otherwise it stores a "blocks" and an "emit" routine
 * address into the table (chosen from OPENSSL_ia32cap_P), writes the
 * masked first 16 key bytes to context+24..39 and returns 1.
 *
 * All four pushed registers are restored on exit; %ecx and %edx are
 * clobbered.
 */
poly1305_init:
.L_poly1305_init_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi

	/* Fetch the three arguments. */
	movl	28(%esp),%ebp
	movl	24(%esp),%esi
	movl	20(%esp),%edi

	/* Clear the first six dwords of the context; %eax stays 0 for the NULL-key return. */
	xorl	%eax,%eax
	movl	%eax,20(%edi)
	movl	%eax,16(%edi)
	movl	%eax,12(%edi)
	movl	%eax,8(%edi)
	movl	%eax,4(%edi)
	movl	%eax,(%edi)

	/* No key: nothing more to set up. */
	testl	%esi,%esi
	je	.L000nokey

	/* call/pop yields the address of .L001pic_point, used as base for the label differences below. */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx

	/* Default pair: the scalar routines. */
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx

	/* Capability word 0 must have both bits of 0x05000000 (bits 24 and 26) set for the SSE2 pair. */
	leal	OPENSSL_ia32cap_P,%edi
	movl	(%edi),%ecx
	andl	$0x05000000,%ecx
	cmpl	$0x05000000,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx

	/* Capability word 2, bit 5 additionally selects the AVX2 blocks routine (emit stays SSE2). */
	movl	8(%edi),%ecx
	testl	$0x20,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax

.L002no_sse2:
	/* %edi was reused for the capability pointer; reload the context pointer. */
	movl	20(%esp),%edi

	/* Publish the chosen routines: table[0] = blocks, table[1] = emit. */
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)

	/* Load and mask the four key words (all loads happen before any store to the context). */
	movl	(%esi),%eax
	andl	$0x0fffffff,%eax
	movl	4(%esi),%ebx
	andl	$0x0ffffffc,%ebx
	movl	8(%esi),%ecx
	andl	$0x0ffffffc,%ecx
	movl	12(%esi),%edx
	andl	$0x0ffffffc,%edx

	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)

	/* Return 1: the function table has been filled in. */
	movl	$1,%eax

.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin
.globl poly1305_blocks
.type poly1305_blocks,@function
.align 16
/*
 * poly1305_blocks - scalar (32-bit integer) block routine.
 *
 * Stack arguments, read after four pushes:
 *   20(%esp)  context pointer: accumulator words h0..h4 at +0..+16,
 *             key words r0..r3 at +24..+36
 *   24(%esp)  input pointer
 *   28(%esp)  input length in bytes
 *   32(%esp)  value added (with carry) into the top accumulator word for
 *             every block; read inside the loop as 96(%esp)
 *
 * .Lenter_blocks is also a jump target for _poly1305_blocks_sse2, which
 * arrives with %edi/%esi/%ecx already loaded and the same four registers
 * pushed.
 *
 * Frame after "subl $64,%esp":
 *    0,12,16(%esp)  h0, h3, h4 of the current block sum
 *   20,24,28(%esp)  low three words of the product
 *   36..48(%esp)    r0..r3
 *   52..60(%esp)    s1..s3 where sN = rN + (rN >> 2)
 *   84(%esp)        context pointer (first argument)
 *   92(%esp)        end-of-input pointer (overwrites the length argument)
 *   96(%esp)        fourth argument
 *
 * The four pushed registers are restored on exit; %eax, %ecx, %edx are
 * clobbered.
 */
poly1305_blocks:
.L_poly1305_blocks_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
.Lenter_blocks:
/*
 * Round the length down to a whole number of 16-byte blocks.
 * FIX: the mask used to be -15 (0x...f1), which keeps bit 0. An odd
 * length then produced an end pointer that the 16-byte stride below never
 * hits, so the "jne .L004loop" exit test could not succeed and the loop
 * ran past the buffer. -16 matches the mask _poly1305_blocks_sse2 applies
 * and is identical for every multiple-of-16 length.
 * NOTE(review): the PIC copy of this routine earlier in the file carries
 * the same -15 mask, and this file is generated - the change belongs in
 * the generator as well.
 */
andl $-16,%ecx
jz .L003nodata
subl $64,%esp
movl 24(%edi),%eax
movl 28(%edi),%ebx
/* %ebp = input + length = end pointer; it is parked in the length slot. */
leal (%esi,%ecx,1),%ebp
movl 32(%edi),%ecx
movl 36(%edi),%edx
movl %ebp,92(%esp)
movl %esi,%ebp
/* Save r0..r3 and derive s1..s3 = rN + (rN >> 2). */
movl %eax,36(%esp)
movl %ebx,%eax
shrl $2,%eax
movl %ebx,40(%esp)
addl %ebx,%eax
movl %ecx,%ebx
shrl $2,%ebx
movl %ecx,44(%esp)
addl %ecx,%ebx
movl %edx,%ecx
shrl $2,%ecx
movl %edx,48(%esp)
addl %edx,%ecx
movl %eax,52(%esp)
movl %ebx,56(%esp)
movl %ecx,60(%esp)
/* Load the accumulator: h0..h4 -> %eax,%ebx,%ecx,%esi,%edi (context pointer in %edi is consumed). */
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
movl 12(%edi),%esi
movl 16(%edi),%edi
jmp .L004loop
.align 32
.L004loop:
/* h += 16 input bytes; the fourth argument is added with carry into h4. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
adcl 12(%ebp),%esi
leal 16(%ebp),%ebp
adcl 96(%esp),%edi
movl %eax,(%esp)
movl %esi,12(%esp)
/* Word 0 of the product: h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi. */
mull 36(%esp)
movl %edi,16(%esp)
movl %eax,%edi
movl %ebx,%eax
movl %edx,%esi
mull 60(%esp)
addl %eax,%edi
movl %ecx,%eax
adcl %edx,%esi
mull 56(%esp)
addl %eax,%edi
movl 12(%esp),%eax
adcl %edx,%esi
mull 52(%esp)
addl %eax,%edi
movl (%esp),%eax
adcl %edx,%esi
/* Word 1: carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 (h4 term uses the low-half imull). */
mull 40(%esp)
movl %edi,20(%esp)
xorl %edi,%edi
addl %eax,%esi
movl %ebx,%eax
adcl %edx,%edi
mull 36(%esp)
addl %eax,%esi
movl %ecx,%eax
adcl %edx,%edi
mull 60(%esp)
addl %eax,%esi
movl 12(%esp),%eax
adcl %edx,%edi
mull 56(%esp)
addl %eax,%esi
movl 16(%esp),%eax
adcl %edx,%edi
imull 52(%esp),%eax
addl %eax,%esi
movl (%esp),%eax
adcl $0,%edi
/* Word 2: carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2. */
mull 44(%esp)
movl %esi,24(%esp)
xorl %esi,%esi
addl %eax,%edi
movl %ebx,%eax
adcl %edx,%esi
mull 40(%esp)
addl %eax,%edi
movl %ecx,%eax
adcl %edx,%esi
mull 36(%esp)
addl %eax,%edi
movl 12(%esp),%eax
adcl %edx,%esi
mull 60(%esp)
addl %eax,%edi
movl 16(%esp),%eax
adcl %edx,%esi
imull 56(%esp),%eax
addl %eax,%edi
movl (%esp),%eax
adcl $0,%esi
/* Word 3: carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3. */
mull 48(%esp)
movl %edi,28(%esp)
xorl %edi,%edi
addl %eax,%esi
movl %ebx,%eax
adcl %edx,%edi
mull 44(%esp)
addl %eax,%esi
movl %ecx,%eax
adcl %edx,%edi
mull 40(%esp)
addl %eax,%esi
movl 12(%esp),%eax
adcl %edx,%edi
mull 36(%esp)
addl %eax,%esi
movl 16(%esp),%ecx
adcl %edx,%edi
movl %ecx,%edx
imull 60(%esp),%ecx
addl %ecx,%esi
movl 20(%esp),%eax
adcl $0,%edi
/* Word 4: carry + h4*r0. */
imull 36(%esp),%edx
addl %edi,%edx
movl 24(%esp),%ebx
movl 28(%esp),%ecx
/* Fold: keep the low 2 bits of word 4 as h4, add 5 * (word4 >> 2) into h0 and ripple the carry. */
movl %edx,%edi
shrl $2,%edx
andl $3,%edi
leal (%edx,%edx,4),%edx
addl %edx,%eax
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%esi
adcl $0,%edi
/* Loop until the input cursor reaches the saved end pointer. */
cmpl 92(%esp),%ebp
jne .L004loop
/* Reload the context pointer (first argument) and write h0..h4 back. */
movl 84(%esp),%edx
addl $64,%esp
movl %eax,(%edx)
movl %ebx,4(%edx)
movl %ecx,8(%edx)
movl %esi,12(%edx)
movl %edi,16(%edx)
.L003nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size poly1305_blocks,.-.L_poly1305_blocks_begin
.globl poly1305_emit
.type poly1305_emit,@function
.align 16
/*
 * poly1305_emit - scalar finalisation.
 *
 * Stack arguments, read after four pushes:
 *   20(%esp)  context pointer (accumulator words h0..h4 at +0..+16)
 *   24(%esp)  16-byte output buffer
 *   28(%esp)  pointer to four 32-bit words that are added to the result
 *
 * Computes h+5 across the five words, uses the bits of the fifth word from
 * bit 2 upwards to build a mask, selects either h+5 or h with and/or (no
 * branches), adds the four words from the third argument with carry and
 * stores the low 128 bits to the output buffer.
 *
 * .Lenter_emit is also a jump target for _poly1305_emit_sse2, which
 * arrives with %ebp already holding the context pointer and the same four
 * registers pushed - do not move instructions across that label.
 */
poly1305_emit:
.L_poly1305_emit_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%ebp
.Lenter_emit:
movl 24(%esp),%edi
movl (%ebp),%eax
movl 4(%ebp),%ebx
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
movl 16(%ebp),%esi
/* h + 5, carry rippled through all five words. */
addl $5,%eax
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%edx
adcl $0,%esi
/* %esi = -(word4 >> 2): all-ones when that quotient is 1, zero when it is 0. */
/* NOTE(review): presumably h is already reduced enough that the quotient is only ever 0 or 1 - confirm. */
shrl $2,%esi
negl %esi
/* Park (h+5) & mask in the output buffer ... */
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
movl %eax,(%edi)
movl %ebx,4(%edi)
movl %ecx,8(%edi)
movl %edx,12(%edi)
/* ... then merge in h & ~mask, giving a branch-free select between h+5 and h. */
notl %esi
movl (%ebp),%eax
movl 4(%ebp),%ebx
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
/* %ebp is repointed at the third argument; the context is no longer needed. */
movl 28(%esp),%ebp
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
orl (%edi),%eax
orl 4(%edi),%ebx
orl 8(%edi),%ecx
orl 12(%edi),%edx
/* Add the four words from the third argument; the final carry-out is discarded. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
adcl 12(%ebp),%edx
movl %eax,(%edi)
movl %ebx,4(%edi)
movl %ecx,8(%edi)
movl %edx,12(%edi)
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size poly1305_emit,.-.L_poly1305_emit_begin
.align 32
.type _poly1305_init_sse2,@function
.align 16
/*
 * _poly1305_init_sse2 - file-local helper with a register calling
 * convention (it takes nothing from the stack):
 *   in:   %edi = context pointer, %ebx = address of .Lconst_sse2
 *   out:  144 bytes written at context+48 (nine 16-byte rows)
 *   %edi is advanced by 48 on entry and moved back by 48 before return.
 *   %ebp is used to save %esp and is NOT preserved; %ecx, %edx and
 *   %xmm0-%xmm7 are clobbered.
 *
 * It splits the 16 bytes at context+24 into five 26-bit pieces, runs the
 * multiply-and-carry body at .L005square twice, and stores the five result
 * vectors plus 5x copies of vectors 1..4.
 * NOTE(review): presumably the two passes produce successive powers of
 * the key for the multi-block code - confirm against the algorithm notes.
 */
_poly1305_init_sse2:
movdqu 24(%edi),%xmm4
leal 48(%edi),%edi
/* Build a 16-byte-aligned 224-byte scratch frame; the old %esp is kept in %ebp. */
movl %esp,%ebp
subl $224,%esp
andl $-16,%esp
/* 64(%ebx) is the 2^26-1 mask row; movq loads its low 64 bits only. */
movq 64(%ebx),%xmm7
/* Split into pieces at bit offsets 0, 26, 52 (= 6 bytes + 4 bits), 78 (= 6 bytes + 30 bits) and 104 (= 13 bytes). */
movdqa %xmm4,%xmm0
movdqa %xmm4,%xmm1
movdqa %xmm4,%xmm2
pand %xmm7,%xmm0
psrlq $26,%xmm1
psrldq $6,%xmm2
pand %xmm7,%xmm1
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
pand %xmm7,%xmm2
pand %xmm7,%xmm3
psrldq $13,%xmm4
/* %edx -> second scratch area (frame+144); %ecx counts the two passes. */
leal 144(%esp),%edx
movl $2,%ecx
.L005square:
/* Save the five pieces at 0..64(%esp). */
movdqa %xmm0,(%esp)
movdqa %xmm1,16(%esp)
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
/* 5x pieces 1..4 (x<<2 + x) at 80..128(%esp). */
movdqa %xmm1,%xmm6
movdqa %xmm2,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm1,%xmm6
paddd %xmm2,%xmm5
movdqa %xmm6,80(%esp)
movdqa %xmm5,96(%esp)
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm3,%xmm6
paddd %xmm4,%xmm5
movdqa %xmm6,112(%esp)
movdqa %xmm5,128(%esp)
/* Broadcast the low 64-bit lane of each piece (pshufd $68) into 0..64(%edx) as the multiplier. */
pshufd $68,%xmm0,%xmm6
movdqa %xmm1,%xmm5
pshufd $68,%xmm1,%xmm1
pshufd $68,%xmm2,%xmm2
pshufd $68,%xmm3,%xmm3
pshufd $68,%xmm4,%xmm4
movdqa %xmm6,(%edx)
movdqa %xmm1,16(%edx)
movdqa %xmm2,32(%edx)
movdqa %xmm3,48(%edx)
movdqa %xmm4,64(%edx)
/* Schoolbook 5x5 product: saved pieces (and their 5x copies) times the broadcast rows, summed into %xmm0..%xmm4. */
pmuludq %xmm0,%xmm4
pmuludq %xmm0,%xmm3
pmuludq %xmm0,%xmm2
pmuludq %xmm0,%xmm1
pmuludq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 48(%edx),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa 80(%esp),%xmm6
pmuludq (%edx),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%edx),%xmm6
movdqa 32(%esp),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa 96(%esp),%xmm7
pmuludq (%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%edx),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%edx),%xmm5
movdqa 48(%esp),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa 112(%esp),%xmm5
pmuludq (%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%edx),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%edx),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%edx),%xmm7
movdqa 64(%esp),%xmm5
paddq %xmm6,%xmm1
movdqa 128(%esp),%xmm6
pmuludq (%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%edx),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%edx),%xmm6
/* Reload the full-width 26-bit mask for the carry pass. */
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Carry pass: shift each sum right by 26 into its neighbour and mask; the carry out of piece 4 re-enters piece 0 as x + (x<<2) = 5x. */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
decl %ecx
jz .L006square_break
/* First pass only: put the saved input pieces into the high 64-bit lane next to the new result, then run the body once more. */
punpcklqdq (%esp),%xmm0
punpcklqdq 16(%esp),%xmm1
punpcklqdq 32(%esp),%xmm2
punpcklqdq 48(%esp),%xmm3
punpcklqdq 64(%esp),%xmm4
jmp .L005square
.L006square_break:
/* Move the second-pass results to the odd dwords, OR in the saved first-pass inputs, and reorder dwords with pshufd $141 (source dwords 1,3,0,2). */
psllq $32,%xmm0
psllq $32,%xmm1
psllq $32,%xmm2
psllq $32,%xmm3
psllq $32,%xmm4
por (%esp),%xmm0
por 16(%esp),%xmm1
por 32(%esp),%xmm2
por 48(%esp),%xmm3
por 64(%esp),%xmm4
pshufd $141,%xmm0,%xmm0
pshufd $141,%xmm1,%xmm1
pshufd $141,%xmm2,%xmm2
pshufd $141,%xmm3,%xmm3
pshufd $141,%xmm4,%xmm4
/* Rows 0..4 of the table (%edi is context+48 here). */
movdqu %xmm0,(%edi)
movdqu %xmm1,16(%edi)
movdqu %xmm2,32(%edi)
movdqu %xmm3,48(%edi)
movdqu %xmm4,64(%edi)
/* Rows 5..8: 5x rows 1..4. */
movdqa %xmm1,%xmm6
movdqa %xmm2,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm1,%xmm6
paddd %xmm2,%xmm5
movdqu %xmm6,80(%edi)
movdqu %xmm5,96(%edi)
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm3,%xmm6
paddd %xmm4,%xmm5
movdqu %xmm6,112(%edi)
movdqu %xmm5,128(%edi)
/* Drop the scratch frame and give the caller back its original %edi. */
movl %ebp,%esp
leal -48(%edi),%edi
ret
.size _poly1305_init_sse2,.-_poly1305_init_sse2
.align 32
.type _poly1305_blocks_sse2,@function
.align 16
/*
 * _poly1305_blocks_sse2 - SSE2 block routine; poly1305_init stores its
 * address in the caller's function table when the SSE2 capability bits
 * are set.
 *
 * Stack arguments, read after four pushes:
 *   20(%esp)  context pointer: five dword accumulator pieces at +0..+16,
 *             a flag dword at +20, a 144-byte table at +48
 *   24(%esp)  input pointer
 *   28(%esp)  input length in bytes
 *   32(%esp)  value that is shifted left by 24 and added to the top piece
 *             on the single-block path
 *
 * The flag at context+20 is set to 1 here once the accumulator has been
 * converted to 26-bit pieces; while it is 0 and fewer than 64 bytes are
 * supplied, control is handed to the scalar routine at .Lenter_blocks.
 *
 * Register roles on the vector path: %ebx = .Lconst_sse2, %edi = context+48,
 * %esi = input cursor, %ecx = bytes remaining, %ebp = saved %esp,
 * %xmm0..%xmm4 = accumulator pieces, %xmm7 = 26-bit mask (reloaded from
 * 64(%ebx) after each multiply).
 *
 * SSE instructions do not write EFLAGS, so several conditional
 * instructions below consume flags set many lines earlier by subl/addl on
 * %ecx. Instruction order is therefore significant.
 */
_poly1305_blocks_sse2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
/* %eax = flag at context+20. */
movl 20(%edi),%eax
/* Whole 16-byte blocks only. */
andl $-16,%ecx
jz .L007nodata
/* 64 bytes or more always take the vector path; shorter input does so only when the flag is already set. */
cmpl $64,%ecx
jae .L008enter_sse2
testl %eax,%eax
jz .Lenter_blocks
.align 16
.L008enter_sse2:
/* call/pop to obtain a code address, then %ebx = &.Lconst_sse2. */
call .L009pic_point
.L009pic_point:
popl %ebx
leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
testl %eax,%eax
jnz .L010base2_26
/* Flag clear: build the table at context+48, then convert the accumulator. */
call _poly1305_init_sse2
/* Overlapping dword loads at byte offsets 0,3,6,9,13 followed by shifts of 0,2,4,6,0 bits extract the pieces at bit offsets 0,26,52,78,104. */
movl (%edi),%eax
movl 3(%edi),%ecx
movl 6(%edi),%edx
movl 9(%edi),%esi
movl 13(%edi),%ebp
movl $1,20(%edi)
shrl $2,%ecx
andl $67108863,%eax
shrl $4,%edx
andl $67108863,%ecx
shrl $6,%esi
andl $67108863,%edx
movd %eax,%xmm0
movd %ecx,%xmm1
movd %edx,%xmm2
movd %esi,%xmm3
movd %ebp,%xmm4
/* %esi and %ecx were used as scratch; reload input pointer and length. */
/* NOTE(review): this reloads the raw length, without the "andl $-16" applied above - harmless if callers only pass multiples of 16; confirm. */
movl 24(%esp),%esi
movl 28(%esp),%ecx
jmp .L011base2_32
.align 16
.L010base2_26:
/* Flag set: the five pieces are already stored as dwords. */
movd (%edi),%xmm0
movd 4(%edi),%xmm1
movd 8(%edi),%xmm2
movd 12(%edi),%xmm3
movd 16(%edi),%xmm4
movdqa 64(%ebx),%xmm7
.L011base2_32:
/* %eax = fourth argument; 528-byte, 16-byte-aligned frame; old %esp kept in %ebp. */
movl 32(%esp),%eax
movl %esp,%ebp
subl $528,%esp
andl $-16,%esp
leal 48(%edi),%edi
shll $24,%eax
/* A length that is not a multiple of 32 means one block is processed alone first. */
testl $31,%ecx
jz .L012even
/* Single block: split 16 input bytes into five pieces and add them to the accumulator. */
movdqu (%esi),%xmm6
leal 16(%esi),%esi
movdqa %xmm6,%xmm5
pand %xmm7,%xmm6
paddd %xmm6,%xmm0
movdqa %xmm5,%xmm6
psrlq $26,%xmm5
psrldq $6,%xmm6
pand %xmm7,%xmm5
paddd %xmm5,%xmm1
movdqa %xmm6,%xmm5
psrlq $4,%xmm6
pand %xmm7,%xmm6
paddd %xmm6,%xmm2
movdqa %xmm5,%xmm6
psrlq $30,%xmm5
pand %xmm7,%xmm5
psrldq $7,%xmm6
paddd %xmm5,%xmm3
movd %eax,%xmm5
paddd %xmm6,%xmm4
movd 12(%edi),%xmm6
paddd %xmm5,%xmm4
movdqa %xmm0,(%esp)
movdqa %xmm1,16(%esp)
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
/* Multiply by the dword at index 3 of table rows 0..4 (offsets 12,28,44,60,76) and of the 5x rows (92,108,124,140). */
pmuludq %xmm6,%xmm0
pmuludq %xmm6,%xmm1
pmuludq %xmm6,%xmm2
movd 28(%edi),%xmm5
pmuludq %xmm6,%xmm3
pmuludq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 48(%esp),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movd 92(%edi),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%esp),%xmm6
movd 44(%edi),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%esp),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%esp),%xmm5
paddq %xmm7,%xmm4
movd 108(%edi),%xmm7
pmuludq (%esp),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%esp),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%esp),%xmm5
movd 60(%edi),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%esp),%xmm6
paddq %xmm5,%xmm0
movd 124(%edi),%xmm5
pmuludq (%esp),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%esp),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%esp),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%esp),%xmm7
movd 76(%edi),%xmm5
paddq %xmm6,%xmm1
movd 140(%edi),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%esp),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%esp),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Carry pass back to 26-bit pieces (same pattern as in _poly1305_init_sse2). */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
subl $16,%ecx
jz .L013done
.L012even:
/* %edx -> frame+384. %eax = input-32 is substituted for %esi by the cmovb below when "subl $64" borrows (fewer than 64 bytes left). */
leal 384(%esp),%edx
leal -32(%esi),%eax
subl $64,%ecx
/* Expand the nine table rows onto the stack: pshufd $68 broadcasts the low 64-bit half to 0..128(%edx), pshufd $238 the high half to -144..-16(%edx). */
movdqu (%edi),%xmm5
pshufd $68,%xmm5,%xmm6
cmovbl %eax,%esi
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,(%edx)
leal 160(%esp),%eax
movdqu 16(%edi),%xmm6
movdqa %xmm5,-144(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,16(%edx)
movdqu 32(%edi),%xmm5
movdqa %xmm6,-128(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,32(%edx)
movdqu 48(%edi),%xmm6
movdqa %xmm5,-112(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,48(%edx)
movdqu 64(%edi),%xmm5
movdqa %xmm6,-96(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,64(%edx)
movdqu 80(%edi),%xmm6
movdqa %xmm5,-80(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,80(%edx)
movdqu 96(%edi),%xmm5
movdqa %xmm6,-64(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,96(%edx)
movdqu 112(%edi),%xmm6
movdqa %xmm5,-48(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,112(%edx)
movdqu 128(%edi),%xmm5
movdqa %xmm6,-32(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,128(%edx)
movdqa %xmm5,-16(%edx)
/* Load two blocks from cursor+32 and cursor+48, park accumulator pieces 2..4 at 112..144(%esp), and split the pair into five piece vectors. */
movdqu 32(%esi),%xmm5
movdqu 48(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,112(%esp)
movdqa %xmm3,128(%esp)
movdqa %xmm4,144(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
/* OR 1<<24 (row 0 of the constant pool) into the top piece of both blocks. */
por (%ebx),%xmm4
/* Accumulator pieces 0 and 1 go to 80 and 96(%esp). */
movdqa %xmm0,80(%esp)
movdqa %xmm1,96(%esp)
/* Flags are still those of "subl $64,%ecx" above: 64 bytes or fewer left skips the main loop. */
jbe .L014skip_loop
jmp .L015loop
.align 32
.L015loop:
/* Main loop, 64 bytes per pass. First half: multiply the pair just loaded by the rows at -144..-16(%edx). */
movdqa -144(%edx),%xmm7
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
movdqa %xmm5,%xmm1
pmuludq %xmm7,%xmm5
movdqa %xmm6,%xmm0
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
pmuludq %xmm7,%xmm3
pmuludq %xmm7,%xmm4
pmuludq -16(%edx),%xmm0
movdqa %xmm1,%xmm7
pmuludq -128(%edx),%xmm1
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq -112(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa %xmm5,%xmm6
pmuludq -96(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 16(%eax),%xmm7
pmuludq -80(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq -128(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq -112(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 32(%eax),%xmm7
pmuludq -96(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq -32(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq -16(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq -128(%edx),%xmm6
paddq %xmm5,%xmm1
movdqa 48(%eax),%xmm5
pmuludq -112(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq -48(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa %xmm6,%xmm7
pmuludq -32(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq -16(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa 64(%eax),%xmm6
pmuludq -128(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa %xmm6,%xmm7
pmuludq -16(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq -64(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq -48(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa 64(%ebx),%xmm7
pmuludq -32(%edx),%xmm6
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Load the two blocks just below the cursor (cursor-32, cursor-16) and split them. */
movdqu -32(%esi),%xmm5
movdqu -16(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
/* Count down 64 bytes; the resulting flags feed the cmovb just below and the "ja .L015loop" at the bottom of the loop. */
leal -32(%esi),%eax
subl $64,%ecx
/* Add the saved accumulator pieces to this pair. */
paddd 80(%esp),%xmm5
paddd 96(%esp),%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
cmovbl %eax,%esi
leal 160(%esp),%eax
/* Second half: multiply by the rows at 0..128(%edx) and add the first half's sums. */
movdqa (%edx),%xmm7
movdqa %xmm1,16(%esp)
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
movdqa %xmm5,%xmm1
pmuludq %xmm7,%xmm5
paddq %xmm0,%xmm5
movdqa %xmm6,%xmm0
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
pmuludq %xmm7,%xmm3
pmuludq %xmm7,%xmm4
paddq 16(%esp),%xmm6
paddq 32(%esp),%xmm2
paddq 48(%esp),%xmm3
paddq 64(%esp),%xmm4
pmuludq 128(%edx),%xmm0
movdqa %xmm1,%xmm7
pmuludq 16(%edx),%xmm1
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa %xmm5,%xmm6
pmuludq 48(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 16(%eax),%xmm7
pmuludq 64(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 32(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 32(%eax),%xmm7
pmuludq 48(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 112(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 128(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 16(%edx),%xmm6
paddq %xmm5,%xmm1
movdqa 48(%eax),%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 96(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa %xmm6,%xmm7
pmuludq 112(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq 128(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa 64(%eax),%xmm6
pmuludq 16(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa %xmm6,%xmm7
pmuludq 128(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 80(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 96(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa 64(%ebx),%xmm7
pmuludq 112(%edx),%xmm6
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Carry pass. */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
/* Pre-load and split the next pair for the following pass; same layout as before the loop. */
movdqu 32(%esi),%xmm5
movdqu 48(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,112(%esp)
movdqa %xmm3,128(%esp)
movdqa %xmm4,144(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
movdqa %xmm0,80(%esp)
movdqa %xmm1,96(%esp)
/* Still the flags of the "subl $64,%ecx" inside this pass: continue while more than 64 bytes remained. */
ja .L015loop
.L014skip_loop:
/* Tail. pshufd $16 places source dwords 0 and 1 in the low dword of each 64-bit lane, giving a different multiplier per lane. */
pshufd $16,-144(%edx),%xmm7
/* %ecx += 32: zero means only the already-loaded pair is left, so the accumulator is added into it here; non-zero (long tail) defers that to the second pair below. */
addl $32,%ecx
jnz .L016long_tail
paddd %xmm0,%xmm5
paddd %xmm1,%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
.L016long_tail:
movdqa %xmm5,(%eax)
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
pmuludq %xmm7,%xmm5
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
movdqa %xmm5,%xmm0
pshufd $16,-128(%edx),%xmm5
pmuludq %xmm7,%xmm3
movdqa %xmm6,%xmm1
pmuludq %xmm7,%xmm4
movdqa %xmm5,%xmm6
pmuludq 48(%eax),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%eax),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%eax),%xmm7
paddq %xmm6,%xmm3
pshufd $16,-64(%edx),%xmm6
pmuludq (%eax),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%eax),%xmm6
pshufd $16,-112(%edx),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%eax),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%eax),%xmm5
paddq %xmm7,%xmm4
pshufd $16,-48(%edx),%xmm7
pmuludq (%eax),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%eax),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%eax),%xmm5
pshufd $16,-96(%edx),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%eax),%xmm6
paddq %xmm5,%xmm0
pshufd $16,-32(%edx),%xmm5
pmuludq (%eax),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%eax),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%eax),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%eax),%xmm7
pshufd $16,-80(%edx),%xmm5
paddq %xmm6,%xmm1
pshufd $16,-16(%edx),%xmm6
pmuludq (%eax),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%eax),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%eax),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%eax),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%eax),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* ZF is still the result of "addl $32,%ecx" above: zero means there is no second pair. */
jz .L017short_tail
/* Long tail: load the remaining pair, add the saved accumulator, multiply by the rows at 0..128(%edx) and add to the sums above. */
movdqu -32(%esi),%xmm5
movdqu -16(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
pshufd $16,(%edx),%xmm7
paddd 80(%esp),%xmm5
paddd 96(%esp),%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
movdqa %xmm5,(%esp)
pmuludq %xmm7,%xmm5
movdqa %xmm6,16(%esp)
pmuludq %xmm7,%xmm6
paddq %xmm5,%xmm0
movdqa %xmm2,%xmm5
pmuludq %xmm7,%xmm2
paddq %xmm6,%xmm1
movdqa %xmm3,%xmm6
pmuludq %xmm7,%xmm3
paddq 32(%esp),%xmm2
movdqa %xmm5,32(%esp)
pshufd $16,16(%edx),%xmm5
paddq 48(%esp),%xmm3
movdqa %xmm6,48(%esp)
movdqa %xmm4,%xmm6
pmuludq %xmm7,%xmm4
paddq 64(%esp),%xmm4
movdqa %xmm6,64(%esp)
movdqa %xmm5,%xmm6
pmuludq 48(%esp),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
pshufd $16,80(%edx),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%esp),%xmm6
pshufd $16,32(%edx),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%esp),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%esp),%xmm5
paddq %xmm7,%xmm4
pshufd $16,96(%edx),%xmm7
pmuludq (%esp),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%esp),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%esp),%xmm5
pshufd $16,48(%edx),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%esp),%xmm6
paddq %xmm5,%xmm0
pshufd $16,112(%edx),%xmm5
pmuludq (%esp),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%esp),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%esp),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%esp),%xmm7
pshufd $16,64(%edx),%xmm5
paddq %xmm6,%xmm1
pshufd $16,128(%edx),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%esp),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%esp),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
.L017short_tail:
/* Horizontal add: pshufd $78 swaps the two 64-bit lanes, so each sum ends up holding lane0+lane1; then a final carry pass. */
pshufd $78,%xmm4,%xmm6
pshufd $78,%xmm3,%xmm5
paddq %xmm6,%xmm4
paddq %xmm5,%xmm3
pshufd $78,%xmm0,%xmm6
pshufd $78,%xmm1,%xmm5
paddq %xmm6,%xmm0
paddq %xmm5,%xmm1
pshufd $78,%xmm2,%xmm6
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm6,%xmm2
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
.L013done:
/* %edi is context+48, so -48..-32(%edi) are the five accumulator dwords at context+0..+16. */
movd %xmm0,-48(%edi)
movd %xmm1,-44(%edi)
movd %xmm2,-40(%edi)
movd %xmm3,-36(%edi)
movd %xmm4,-32(%edi)
/* Drop the aligned frame. */
movl %ebp,%esp
.L007nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
.align 32
.type _poly1305_emit_sse2,@function
.align 16
/*
 * _poly1305_emit_sse2 - finalisation for a context that may hold its
 * accumulator as five 26-bit dword pieces.
 *
 * Stack arguments, read after four pushes:
 *   20(%esp)  context pointer
 *   24(%esp)  16-byte output buffer
 *   28(%esp)  pointer to four 32-bit words that are added to the result
 *
 * If the flag dword at context+20 is zero, control is handed to the scalar
 * routine at .Lenter_emit (same stack layout, %ebp = context). Otherwise
 * the five pieces are packed into 32-bit words, the bits above bit 129 are
 * folded back in as a multiple of 5, and the same "h+5, branch-free select,
 * add the third argument" sequence as the scalar routine follows.
 * %xmm0..%xmm3 are used as scratch to hold the unmodified words.
 */
_poly1305_emit_sse2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%ebp
cmpl $0,20(%ebp)
je .Lenter_emit
/* Load the five pieces into %eax, %edi, %ecx, %edx, %esi. */
movl (%ebp),%eax
movl 4(%ebp),%edi
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
movl 16(%ebp),%esi
/* Pack pieces at bit offsets 26/52/78/104 into 32-bit words: each piece is split by a shl/shr pair (26+6, 20+12, 14+18, 8+24) and added with carry. */
movl %edi,%ebx
shll $26,%edi
shrl $6,%ebx
addl %edi,%eax
movl %ecx,%edi
adcl $0,%ebx
shll $20,%edi
shrl $12,%ecx
addl %edi,%ebx
movl %edx,%edi
adcl $0,%ecx
shll $14,%edi
shrl $18,%edx
addl %edi,%ecx
movl %esi,%edi
adcl $0,%edx
shll $8,%edi
shrl $24,%esi
addl %edi,%edx
adcl $0,%esi
/* Fold: keep the low 2 bits of the top word, add 5 * (top >> 2) into word 0 and ripple the carry. */
movl %esi,%edi
andl $3,%esi
shrl $2,%edi
leal (%edi,%edi,4),%ebp
/* %edi = output buffer, %ebp = third argument (loaded between the arithmetic steps; movl does not touch flags). */
movl 24(%esp),%edi
addl %ebp,%eax
movl 28(%esp),%ebp
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%edx
adcl $0,%esi
/* Stash h in %xmm0..%xmm3 while computing h+5 in the integer registers (movd does not touch flags). */
movd %eax,%xmm0
addl $5,%eax
movd %ebx,%xmm1
adcl $0,%ebx
movd %ecx,%xmm2
adcl $0,%ecx
movd %edx,%xmm3
adcl $0,%edx
adcl $0,%esi
/* %esi = -(top >> 2): the select mask. */
shrl $2,%esi
negl %esi
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
/* Park (h+5) & mask in the output buffer and bring h back from the xmm registers. */
movl %eax,(%edi)
movd %xmm0,%eax
movl %ebx,4(%edi)
movd %xmm1,%ebx
movl %ecx,8(%edi)
movd %xmm2,%ecx
movl %edx,12(%edi)
movd %xmm3,%edx
/* Merge in h & ~mask. */
notl %esi
andl %esi,%eax
andl %esi,%ebx
orl (%edi),%eax
andl %esi,%ecx
orl 4(%edi),%ebx
andl %esi,%edx
orl 8(%edi),%ecx
orl 12(%edi),%edx
/* Add the four words from the third argument with carry and store; the stores are interleaved with the adcl chain (movl leaves CF intact). */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
movl %eax,(%edi)
adcl 8(%ebp),%ecx
movl %ebx,4(%edi)
adcl 12(%ebp),%edx
movl %ecx,8(%edi)
movl %edx,12(%edi)
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
.align 32
/*
 * _poly1305_init_avx2 - build the table of key powers used by
 * _poly1305_blocks_avx2.
 *
 * Register interface (this is an internal helper, not a C-callable entry):
 *   in   %edi = pointer to the state; the 16 bytes at state+24 are the key
 *               words stored there by poly1305_init
 *   in   %ebx = address of .Lconst_sse2
 *   out  state+48 .. state+127  : five 16-byte rows, one per 26-bit limb;
 *                                 the four dwords of a row hold that limb of
 *                                 r^4, r^3, r^2, r^1 (in that order)
 *        state+128 .. state+191 : the same rows for limbs 1..4 multiplied
 *                                 by 5
 *   %edi and %esp are restored on return.
 *   Clobbers %ebp (used to save %esp and NOT restored), %ecx, %edx,
 *   %xmm0-%xmm7 and the flags.
 */
.type _poly1305_init_avx2,@function
.align 16
_poly1305_init_avx2:
vmovdqu 24(%edi),%xmm4
leal 48(%edi),%edi
/* Carve out a 16-byte aligned scratch frame; the old %esp lives in %ebp. */
movl %esp,%ebp
subl $224,%esp
andl $-16,%esp
/* xmm7 = 2^26 - 1 in the low dword of each qword. */
vmovdqa 64(%ebx),%xmm7
/*
 * Split r into five 26-bit limbs held in the low qword of xmm0..xmm4:
 * bits 0-25, 26-51, 52-77, 78-103 and 104-127.  The high qwords hold
 * leftovers that are replaced before they can reach the output.
 */
vpand %xmm7,%xmm4,%xmm0
vpsrlq $26,%xmm4,%xmm1
vpsrldq $6,%xmm4,%xmm3
vpand %xmm7,%xmm1,%xmm1
vpsrlq $4,%xmm3,%xmm2
vpsrlq $30,%xmm3,%xmm3
vpand %xmm7,%xmm2,%xmm2
vpand %xmm7,%xmm3,%xmm3
vpsrldq $13,%xmm4,%xmm4
/* edx = second scratch area (multiplier table); two passes through the loop. */
leal 144(%esp),%edx
movl $2,%ecx
/*
 * Each pass multiplies the value in xmm0..xmm4 by the value in its own low
 * qword.  Pass 1 yields r^2 in the low qword; pass 2 starts from
 * (r^2 low, r high) and yields (r^4 low, r^3 high).
 */
.L018square:
/* Save the multiplicand limbs at 0..64(%esp). */
vmovdqa %xmm0,(%esp)
vmovdqa %xmm1,16(%esp)
vmovdqa %xmm2,32(%esp)
vmovdqa %xmm3,48(%esp)
vmovdqa %xmm4,64(%esp)
/* Save 5 * limb for limbs 1..4 at 80..128(%esp), computed as (x << 2) + x. */
vpslld $2,%xmm1,%xmm6
vpslld $2,%xmm2,%xmm5
vpaddd %xmm1,%xmm6,%xmm6
vpaddd %xmm2,%xmm5,%xmm5
vmovdqa %xmm6,80(%esp)
vmovdqa %xmm5,96(%esp)
vpslld $2,%xmm3,%xmm6
vpslld $2,%xmm4,%xmm5
vpaddd %xmm3,%xmm6,%xmm6
vpaddd %xmm4,%xmm5,%xmm5
vmovdqa %xmm6,112(%esp)
vmovdqa %xmm5,128(%esp)
/*
 * Broadcast the low qword of every limb to both qwords ($68 selects dwords
 * 0,1,0,1) and store the result as the multiplier table at 0..64(%edx).
 * xmm6 keeps the un-broadcast limb 1.
 */
vpshufd $68,%xmm0,%xmm5
vmovdqa %xmm1,%xmm6
vpshufd $68,%xmm1,%xmm1
vpshufd $68,%xmm2,%xmm2
vpshufd $68,%xmm3,%xmm3
vpshufd $68,%xmm4,%xmm4
vmovdqa %xmm5,(%edx)
vmovdqa %xmm1,16(%edx)
vmovdqa %xmm2,32(%edx)
vmovdqa %xmm3,48(%edx)
vmovdqa %xmm4,64(%edx)
/*
 * 5x5 limb multiplication, accumulating into xmm0..xmm4 (result limbs
 * 0..4).  A product whose limb indices sum to 5 or more wraps around and
 * therefore uses the saved 5 * limb copy from 80..128(%esp).
 */
vpmuludq %xmm0,%xmm4,%xmm4
vpmuludq %xmm0,%xmm3,%xmm3
vpmuludq %xmm0,%xmm2,%xmm2
vpmuludq %xmm0,%xmm1,%xmm1
vpmuludq %xmm0,%xmm5,%xmm0
vpmuludq 48(%edx),%xmm6,%xmm5
vpaddq %xmm5,%xmm4,%xmm4
vpmuludq 32(%edx),%xmm6,%xmm7
vpaddq %xmm7,%xmm3,%xmm3
vpmuludq 16(%edx),%xmm6,%xmm5
vpaddq %xmm5,%xmm2,%xmm2
vmovdqa 80(%esp),%xmm7
vpmuludq (%edx),%xmm6,%xmm6
vpaddq %xmm6,%xmm1,%xmm1
vmovdqa 32(%esp),%xmm5
vpmuludq 64(%edx),%xmm7,%xmm7
vpaddq %xmm7,%xmm0,%xmm0
vpmuludq 32(%edx),%xmm5,%xmm6
vpaddq %xmm6,%xmm4,%xmm4
vpmuludq 16(%edx),%xmm5,%xmm7
vpaddq %xmm7,%xmm3,%xmm3
vmovdqa 96(%esp),%xmm6
vpmuludq (%edx),%xmm5,%xmm5
vpaddq %xmm5,%xmm2,%xmm2
vpmuludq 64(%edx),%xmm6,%xmm7
vpaddq %xmm7,%xmm1,%xmm1
vmovdqa 48(%esp),%xmm5
vpmuludq 48(%edx),%xmm6,%xmm6
vpaddq %xmm6,%xmm0,%xmm0
vpmuludq 16(%edx),%xmm5,%xmm7
vpaddq %xmm7,%xmm4,%xmm4
vmovdqa 112(%esp),%xmm6
vpmuludq (%edx),%xmm5,%xmm5
vpaddq %xmm5,%xmm3,%xmm3
vpmuludq 64(%edx),%xmm6,%xmm7
vpaddq %xmm7,%xmm2,%xmm2
vpmuludq 48(%edx),%xmm6,%xmm5
vpaddq %xmm5,%xmm1,%xmm1
vmovdqa 64(%esp),%xmm7
vpmuludq 32(%edx),%xmm6,%xmm6
vpaddq %xmm6,%xmm0,%xmm0
vmovdqa 128(%esp),%xmm5
vpmuludq (%edx),%xmm7,%xmm7
vpaddq %xmm7,%xmm4,%xmm4
vpmuludq 64(%edx),%xmm5,%xmm6
vpaddq %xmm6,%xmm3,%xmm3
vpmuludq 16(%edx),%xmm5,%xmm7
vpaddq %xmm7,%xmm0,%xmm0
vpmuludq 32(%edx),%xmm5,%xmm6
vpaddq %xmm6,%xmm1,%xmm1
/* Reload the 26-bit mask; xmm7 was used as a temporary above. */
vmovdqa 64(%ebx),%xmm7
vpmuludq 48(%edx),%xmm5,%xmm5
vpaddq %xmm5,%xmm2,%xmm2
/*
 * Carry propagation.  Two interleaved chains: limb 3 -> 4 -> 0 -> 1 and
 * limb 0 -> 1 -> 2 -> 3 -> 4.  The carry out of limb 4 re-enters limb 0
 * multiplied by 5 (added once, then added again shifted left by 2).
 */
vpsrlq $26,%xmm3,%xmm5
vpand %xmm7,%xmm3,%xmm3
vpsrlq $26,%xmm0,%xmm6
vpand %xmm7,%xmm0,%xmm0
vpaddq %xmm5,%xmm4,%xmm4
vpaddq %xmm6,%xmm1,%xmm1
vpsrlq $26,%xmm4,%xmm5
vpand %xmm7,%xmm4,%xmm4
vpsrlq $26,%xmm1,%xmm6
vpand %xmm7,%xmm1,%xmm1
vpaddq %xmm6,%xmm2,%xmm2
vpaddd %xmm5,%xmm0,%xmm0
vpsllq $2,%xmm5,%xmm5
vpsrlq $26,%xmm2,%xmm6
vpand %xmm7,%xmm2,%xmm2
vpaddd %xmm5,%xmm0,%xmm0
vpaddd %xmm6,%xmm3,%xmm3
vpsrlq $26,%xmm3,%xmm6
vpsrlq $26,%xmm0,%xmm5
vpand %xmm7,%xmm0,%xmm0
vpand %xmm7,%xmm3,%xmm3
vpaddd %xmm5,%xmm1,%xmm1
vpaddd %xmm6,%xmm4,%xmm4
decl %ecx
jz .L019square_break
/*
 * End of pass 1: pair the new low qword (r^2) with the low qword of the
 * saved multiplicand (r) and run the multiplication once more.
 */
vpunpcklqdq (%esp),%xmm0,%xmm0
vpunpcklqdq 16(%esp),%xmm1,%xmm1
vpunpcklqdq 32(%esp),%xmm2,%xmm2
vpunpcklqdq 48(%esp),%xmm3,%xmm3
vpunpcklqdq 64(%esp),%xmm4,%xmm4
jmp .L018square
.L019square_break:
/*
 * xmm0..xmm4 hold (r^4, r^3) in their qwords and 0..64(%esp) hold
 * (r^2, r).  Move the new values into the odd dwords, merge the saved ones
 * into the even dwords, then reorder with $141 (dwords 1,3,0,2) so each
 * register reads r^4, r^3, r^2, r^1.
 */
vpsllq $32,%xmm0,%xmm0
vpsllq $32,%xmm1,%xmm1
vpsllq $32,%xmm2,%xmm2
vpsllq $32,%xmm3,%xmm3
vpsllq $32,%xmm4,%xmm4
vpor (%esp),%xmm0,%xmm0
vpor 16(%esp),%xmm1,%xmm1
vpor 32(%esp),%xmm2,%xmm2
vpor 48(%esp),%xmm3,%xmm3
vpor 64(%esp),%xmm4,%xmm4
vpshufd $141,%xmm0,%xmm0
vpshufd $141,%xmm1,%xmm1
vpshufd $141,%xmm2,%xmm2
vpshufd $141,%xmm3,%xmm3
vpshufd $141,%xmm4,%xmm4
/* Store the five limb rows; %edi still points at state+48 here. */
vmovdqu %xmm0,(%edi)
vmovdqu %xmm1,16(%edi)
vmovdqu %xmm2,32(%edi)
vmovdqu %xmm3,48(%edi)
vmovdqu %xmm4,64(%edi)
/* Store 5 * limb rows for limbs 1..4 right after them. */
vpslld $2,%xmm1,%xmm6
vpslld $2,%xmm2,%xmm5
vpaddd %xmm1,%xmm6,%xmm6
vpaddd %xmm2,%xmm5,%xmm5
vmovdqu %xmm6,80(%edi)
vmovdqu %xmm5,96(%edi)
vpslld $2,%xmm3,%xmm6
vpslld $2,%xmm4,%xmm5
vpaddd %xmm3,%xmm6,%xmm6
vpaddd %xmm4,%xmm5,%xmm5
vmovdqu %xmm6,112(%edi)
vmovdqu %xmm5,128(%edi)
/* Drop the scratch frame and hand %edi back pointing at the state. */
movl %ebp,%esp
leal -48(%edi),%edi
ret
.size _poly1305_init_avx2,.-_poly1305_init_avx2
.align 32
/*
 * _poly1305_blocks_avx2 - absorb 16-byte blocks into the hash, four blocks
 * per vector operation.
 *
 * Arguments are read from the stack after four register pushes:
 *   20(%esp)  pointer to the state
 *   24(%esp)  pointer to the input
 *   28(%esp)  input length in bytes (rounded down to a multiple of 16 on
 *             entry; NOTE(review): the value reloaded after the one-time
 *             conversion below is not rounded again, so callers presumably
 *             always pass a multiple of 16 - confirm)
 *   32(%esp)  fourth argument; it only influences the single-block case,
 *             where it selects whether the 2^128 pad bit is added
 *
 * State layout as used here: five hash words at +0..+16, a flag at +20
 * (non-zero once the hash is stored as 26-bit limbs), and the key-power
 * table written by _poly1305_init_avx2 at +48..+191.
 *
 * Inputs shorter than 64 bytes on a state that has not been converted yet
 * are handed to the scalar code at .Lenter_blocks.
 */
.type _poly1305_blocks_avx2,@function
.align 16
_poly1305_blocks_avx2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
/* eax = conversion flag. */
movl 20(%edi),%eax
andl $-16,%ecx
jz .L020nodata
cmpl $64,%ecx
jae .L021enter_avx2
/* Short input and hash not converted yet: use the scalar path. */
testl %eax,%eax
jz .Lenter_blocks
.L021enter_avx2:
vzeroupper
/* Position-independent way of getting the address of .Lconst_sse2 in ebx. */
call .L022pic_point
.L022pic_point:
popl %ebx
leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
testl %eax,%eax
jnz .L023base2_26
/*
 * First use of the vector path: build the key-power table, then convert
 * the hash in place to five 26-bit limbs.  The loads at byte offsets
 * 0, 3, 6, 9 and 13 overlap on purpose so that each limb starts within
 * the loaded dword; the shifts and masks then isolate it.
 */
call _poly1305_init_avx2
movl (%edi),%eax
movl 3(%edi),%ecx
movl 6(%edi),%edx
movl 9(%edi),%esi
movl 13(%edi),%ebp
shrl $2,%ecx
andl $67108863,%eax
shrl $4,%edx
andl $67108863,%ecx
shrl $6,%esi
andl $67108863,%edx
movl %eax,(%edi)
movl %ecx,4(%edi)
movl %edx,8(%edi)
movl %esi,12(%edi)
movl %ebp,16(%edi)
/* Mark the hash as converted. */
movl $1,20(%edi)
/* esi and ecx were used as temporaries above: reload input pointer and length. */
movl 24(%esp),%esi
movl 28(%esp),%ecx
.L023base2_26:
/* eax = fourth argument. */
movl 32(%esp),%eax
/*
 * Scratch frame: old %esp kept in %ebp, at least 448 bytes, rounded down
 * to a 512-byte boundary so the 32-byte vmovdqa accesses below are aligned.
 */
movl %esp,%ebp
subl $448,%esp
andl $-512,%esp
/*
 * Expand the key-power table onto the stack.  edx points into the middle
 * of nine 32-byte rows at -128..128(%edx): limbs 0..4 followed by
 * 5 * limbs 1..4.  After vpermq $64 and vpshufd $200 a row's dwords are
 *   r^4 r^4 r^4 r^3 r^4 r^2 r^4 r^1
 * so vpmuludq (which reads the even dwords) sees r^4 in every lane for an
 * aligned row, and r^4, r^3, r^2, r^1 when the row is read 4 bytes further
 * on, as the tail code does.
 */
vmovdqu 48(%edi),%xmm0
leal 288(%esp),%edx
vmovdqu 64(%edi),%xmm1
vmovdqu 80(%edi),%xmm2
vmovdqu 96(%edi),%xmm3
vmovdqu 112(%edi),%xmm4
/* From here on edi = state + 48. */
leal 48(%edi),%edi
vpermq $64,%ymm0,%ymm0
vpermq $64,%ymm1,%ymm1
vpermq $64,%ymm2,%ymm2
vpermq $64,%ymm3,%ymm3
vpermq $64,%ymm4,%ymm4
vpshufd $200,%ymm0,%ymm0
vpshufd $200,%ymm1,%ymm1
vpshufd $200,%ymm2,%ymm2
vpshufd $200,%ymm3,%ymm3
vpshufd $200,%ymm4,%ymm4
vmovdqa %ymm0,-128(%edx)
vmovdqu 80(%edi),%xmm0
vmovdqa %ymm1,-96(%edx)
vmovdqu 96(%edi),%xmm1
vmovdqa %ymm2,-64(%edx)
vmovdqu 112(%edi),%xmm2
vmovdqa %ymm3,-32(%edx)
vmovdqu 128(%edi),%xmm3
vmovdqa %ymm4,(%edx)
vpermq $64,%ymm0,%ymm0
vpermq $64,%ymm1,%ymm1
vpermq $64,%ymm2,%ymm2
vpermq $64,%ymm3,%ymm3
vpshufd $200,%ymm0,%ymm0
vpshufd $200,%ymm1,%ymm1
vpshufd $200,%ymm2,%ymm2
vpshufd $200,%ymm3,%ymm3
/* Store the 5 * limb rows while loading hash limbs h0..h4 into lane 0 of ymm0..ymm4. */
vmovdqa %ymm0,32(%edx)
vmovd -48(%edi),%xmm0
vmovdqa %ymm1,64(%edx)
vmovd -44(%edi),%xmm1
vmovdqa %ymm2,96(%edx)
vmovd -40(%edi),%xmm2
vmovdqa %ymm3,128(%edx)
vmovd -36(%edi),%xmm3
vmovd -32(%edi),%xmm4
/* ymm7 = 2^26 - 1 in every qword lane. */
vmovdqa 64(%ebx),%ymm7
negl %eax
testl $63,%ecx
jz .L024even
/*
 * Length is not a multiple of 64: run the 1-3 leading blocks through the
 * tail first.  ecx keeps the remaining multiple of 64, edx the leftover
 * byte count (16, 32 or 48).  For k leading blocks ebx is advanced so the
 * 32-byte pad constant read at (%ebx) has 2^24 in only the first k lanes,
 * and edx is advanced so the tail multiplies lane i by r^(k-i).
 */
movl %ecx,%edx
andl $-64,%ecx
andl $63,%edx
vmovdqu (%esi),%xmm5
cmpl $32,%edx
jb .L025one
vmovdqu 16(%esi),%xmm6
/* Still testing the flags of the cmpl above; vmovdqu leaves them alone. */
je .L026two
/* Three blocks: lanes 0,1,2 carry data, lane 3 is zero. */
vinserti128 $1,32(%esi),%ymm5,%ymm5
leal 48(%esi),%esi
leal 8(%ebx),%ebx
leal 296(%esp),%edx
jmp .L027tail
.L026two:
/* Two blocks: lanes 0 and 1 carry data. */
leal 32(%esi),%esi
leal 16(%ebx),%ebx
leal 304(%esp),%edx
jmp .L027tail
.L025one:
/*
 * One block in lane 0.  eax = -(fourth argument), so ebx ends up at
 * const + 32 - 8 * arg: arg = 1 puts 2^24 into lane 0, arg = 0 reads the
 * all-zero row and adds no pad bit.
 */
leal 16(%esi),%esi
vpxor %ymm6,%ymm6,%ymm6
leal 32(%ebx,%eax,8),%ebx
leal 312(%esp),%edx
jmp .L027tail
.align 32
.L024even:
/*
 * Load four blocks: ymm5 = (block 0, block 2), ymm6 = (block 1, block 3).
 * If these are the last four, go straight to the tail.
 */
vmovdqu (%esi),%xmm5
vmovdqu 16(%esi),%xmm6
vinserti128 $1,32(%esi),%ymm5,%ymm5
vinserti128 $1,48(%esi),%ymm6,%ymm6
leal 64(%esi),%esi
subl $64,%ecx
jz .L027tail
/*
 * Main loop, 64 bytes per iteration: split the four loaded blocks into
 * 26-bit limbs (one block per qword lane), add the running accumulators,
 * multiply every lane by r^4, propagate carries, load the next 64 bytes.
 */
.L028loop:
/* Spill accumulator limbs 0..2; split the input into limbs. */
vmovdqa %ymm2,64(%esp)
vpsrldq $6,%ymm5,%ymm2
vmovdqa %ymm0,(%esp)
vpsrldq $6,%ymm6,%ymm0
vmovdqa %ymm1,32(%esp)
vpunpckhqdq %ymm6,%ymm5,%ymm1
vpunpcklqdq %ymm6,%ymm5,%ymm5
vpunpcklqdq %ymm0,%ymm2,%ymm2
vpsrlq $30,%ymm2,%ymm0
vpsrlq $4,%ymm2,%ymm2
vpsrlq $26,%ymm5,%ymm6
vpsrlq $40,%ymm1,%ymm1
vpand %ymm7,%ymm2,%ymm2
vpand %ymm7,%ymm5,%ymm5
vpand %ymm7,%ymm6,%ymm6
vpand %ymm7,%ymm0,%ymm0
/* Set the pad bit: 2^24 in limb 4 is bit 128 of the block. */
vpor (%ebx),%ymm1,%ymm1
/* Add the accumulators: ymm5/ymm6/ymm2/ymm0/ymm1 = limbs 0/1/2/3/4. */
vpaddq 64(%esp),%ymm2,%ymm2
vpaddq (%esp),%ymm5,%ymm5
vpaddq 32(%esp),%ymm6,%ymm6
vpaddq %ymm3,%ymm0,%ymm0
vpaddq %ymm4,%ymm1,%ymm1
/*
 * 5x5 limb multiplication by the table rows, accumulating result limbs
 * 0..4 in ymm0..ymm4.  Rows at 32..128(%edx) are the 5 * limb copies used
 * for products that wrap around.  Limbs 1, 3 and 4 are spilled to
 * 32/96/128(%esp) because only eight vector registers are available.
 */
vpmuludq -96(%edx),%ymm2,%ymm3
vmovdqa %ymm6,32(%esp)
vpmuludq -64(%edx),%ymm2,%ymm4
vmovdqa %ymm0,96(%esp)
vpmuludq 96(%edx),%ymm2,%ymm0
vmovdqa %ymm1,128(%esp)
vpmuludq 128(%edx),%ymm2,%ymm1
vpmuludq -128(%edx),%ymm2,%ymm2
vpmuludq -32(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq (%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm4,%ymm4
vpmuludq -128(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm0,%ymm0
vmovdqa 32(%esp),%ymm7
vpmuludq -96(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq -64(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
vpmuludq -64(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm3,%ymm3
vpmuludq -32(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm4,%ymm4
vpmuludq 128(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vmovdqa 96(%esp),%ymm6
vpmuludq -128(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm1,%ymm1
vpmuludq -96(%edx),%ymm7,%ymm7
vpaddq %ymm7,%ymm2,%ymm2
vpmuludq -128(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm3,%ymm3
vpmuludq -96(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vpmuludq 64(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm0,%ymm0
vmovdqa 128(%esp),%ymm5
vpmuludq 96(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm1,%ymm1
vpmuludq 128(%edx),%ymm6,%ymm6
vpaddq %ymm6,%ymm2,%ymm2
vpmuludq 128(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq 32(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vpmuludq -128(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
/* Reload the 26-bit mask; ymm7 was used as a temporary above. */
vmovdqa 64(%ebx),%ymm7
vpmuludq 64(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq 96(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
/*
 * Carry propagation: chains 3 -> 4 -> 0 -> 1 and 0 -> 1 -> 2 -> 3 -> 4,
 * with the carry out of limb 4 entering limb 0 multiplied by 5.
 */
vpsrlq $26,%ymm3,%ymm5
vpand %ymm7,%ymm3,%ymm3
vpsrlq $26,%ymm0,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpaddq %ymm5,%ymm4,%ymm4
vpaddq %ymm6,%ymm1,%ymm1
vpsrlq $26,%ymm4,%ymm5
vpand %ymm7,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm6
vpand %ymm7,%ymm1,%ymm1
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpsllq $2,%ymm5,%ymm5
vpsrlq $26,%ymm2,%ymm6
vpand %ymm7,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpaddq %ymm6,%ymm3,%ymm3
vpsrlq $26,%ymm3,%ymm6
vpsrlq $26,%ymm0,%ymm5
vpand %ymm7,%ymm0,%ymm0
vpand %ymm7,%ymm3,%ymm3
vpaddq %ymm5,%ymm1,%ymm1
vpaddq %ymm6,%ymm4,%ymm4
/* Load the next four blocks. */
vmovdqu (%esi),%xmm5
vmovdqu 16(%esi),%xmm6
vinserti128 $1,32(%esi),%ymm5,%ymm5
vinserti128 $1,48(%esi),%ymm6,%ymm6
leal 64(%esi),%esi
subl $64,%ecx
jnz .L028loop
/*
 * Tail: same limb split and multiplication as the loop body, except that
 * every table row is read 4 bytes further on, which makes the lanes
 * multiply by descending powers of r instead of r^4 everywhere.  The four
 * lanes are then summed into lane 0.
 */
.L027tail:
vmovdqa %ymm2,64(%esp)
vpsrldq $6,%ymm5,%ymm2
vmovdqa %ymm0,(%esp)
vpsrldq $6,%ymm6,%ymm0
vmovdqa %ymm1,32(%esp)
vpunpckhqdq %ymm6,%ymm5,%ymm1
vpunpcklqdq %ymm6,%ymm5,%ymm5
vpunpcklqdq %ymm0,%ymm2,%ymm2
vpsrlq $30,%ymm2,%ymm0
vpsrlq $4,%ymm2,%ymm2
vpsrlq $26,%ymm5,%ymm6
vpsrlq $40,%ymm1,%ymm1
vpand %ymm7,%ymm2,%ymm2
vpand %ymm7,%ymm5,%ymm5
vpand %ymm7,%ymm6,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpor (%ebx),%ymm1,%ymm1
/*
 * ebx may have been advanced by 8..32 bytes for a partial group; round it
 * back to the 64-byte aligned start of .Lconst_sse2 so the aligned mask
 * load below and any later full group use the unshifted constants.
 */
andl $-64,%ebx
vpaddq 64(%esp),%ymm2,%ymm2
vpaddq (%esp),%ymm5,%ymm5
vpaddq 32(%esp),%ymm6,%ymm6
vpaddq %ymm3,%ymm0,%ymm0
vpaddq %ymm4,%ymm1,%ymm1
vpmuludq -92(%edx),%ymm2,%ymm3
vmovdqa %ymm6,32(%esp)
vpmuludq -60(%edx),%ymm2,%ymm4
vmovdqa %ymm0,96(%esp)
vpmuludq 100(%edx),%ymm2,%ymm0
vmovdqa %ymm1,128(%esp)
vpmuludq 132(%edx),%ymm2,%ymm1
vpmuludq -124(%edx),%ymm2,%ymm2
vpmuludq -28(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq 4(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm4,%ymm4
vpmuludq -124(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm0,%ymm0
vmovdqa 32(%esp),%ymm7
vpmuludq -92(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq -60(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
vpmuludq -60(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm3,%ymm3
vpmuludq -28(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm4,%ymm4
vpmuludq 132(%edx),%ymm7,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vmovdqa 96(%esp),%ymm6
vpmuludq -124(%edx),%ymm7,%ymm5
vpaddq %ymm5,%ymm1,%ymm1
vpmuludq -92(%edx),%ymm7,%ymm7
vpaddq %ymm7,%ymm2,%ymm2
vpmuludq -124(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm3,%ymm3
vpmuludq -92(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vpmuludq 68(%edx),%ymm6,%ymm5
vpaddq %ymm5,%ymm0,%ymm0
vmovdqa 128(%esp),%ymm5
vpmuludq 100(%edx),%ymm6,%ymm7
vpaddq %ymm7,%ymm1,%ymm1
vpmuludq 132(%edx),%ymm6,%ymm6
vpaddq %ymm6,%ymm2,%ymm2
vpmuludq 132(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm3,%ymm3
vpmuludq 36(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm0,%ymm0
vpmuludq -124(%edx),%ymm5,%ymm7
vpaddq %ymm7,%ymm4,%ymm4
vmovdqa 64(%ebx),%ymm7
vpmuludq 68(%edx),%ymm5,%ymm6
vpaddq %ymm6,%ymm1,%ymm1
vpmuludq 100(%edx),%ymm5,%ymm5
vpaddq %ymm5,%ymm2,%ymm2
/*
 * Horizontal sum of the four lanes into lane 0: first add the upper qword
 * of each 128-bit half to the lower one, then add qword 2 (brought down by
 * vpermq $2) to qword 0.
 */
vpsrldq $8,%ymm4,%ymm5
vpsrldq $8,%ymm3,%ymm6
vpaddq %ymm5,%ymm4,%ymm4
vpsrldq $8,%ymm0,%ymm5
vpaddq %ymm6,%ymm3,%ymm3
vpsrldq $8,%ymm1,%ymm6
vpaddq %ymm5,%ymm0,%ymm0
vpsrldq $8,%ymm2,%ymm5
vpaddq %ymm6,%ymm1,%ymm1
vpermq $2,%ymm4,%ymm6
vpaddq %ymm5,%ymm2,%ymm2
vpermq $2,%ymm3,%ymm5
vpaddq %ymm6,%ymm4,%ymm4
vpermq $2,%ymm0,%ymm6
vpaddq %ymm5,%ymm3,%ymm3
vpermq $2,%ymm1,%ymm5
vpaddq %ymm6,%ymm0,%ymm0
vpermq $2,%ymm2,%ymm6
vpaddq %ymm5,%ymm1,%ymm1
vpaddq %ymm6,%ymm2,%ymm2
/* Carry propagation, same schedule as in the loop. */
vpsrlq $26,%ymm3,%ymm5
vpand %ymm7,%ymm3,%ymm3
vpsrlq $26,%ymm0,%ymm6
vpand %ymm7,%ymm0,%ymm0
vpaddq %ymm5,%ymm4,%ymm4
vpaddq %ymm6,%ymm1,%ymm1
vpsrlq $26,%ymm4,%ymm5
vpand %ymm7,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm6
vpand %ymm7,%ymm1,%ymm1
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpsllq $2,%ymm5,%ymm5
vpsrlq $26,%ymm2,%ymm6
vpand %ymm7,%ymm2,%ymm2
vpaddq %ymm5,%ymm0,%ymm0
vpaddq %ymm6,%ymm3,%ymm3
vpsrlq $26,%ymm3,%ymm6
vpsrlq $26,%ymm0,%ymm5
vpand %ymm7,%ymm0,%ymm0
vpand %ymm7,%ymm3,%ymm3
vpaddq %ymm5,%ymm1,%ymm1
vpaddq %ymm6,%ymm4,%ymm4
/* ecx != 0 only when the tail just handled the 1-3 leading blocks. */
cmpl $0,%ecx
je .L029done
/*
 * Full 64-byte groups remain.  Keep only the low dword of each accumulator
 * ($252 copies dword 3 into dwords 1..3; the 128-bit VEX form clears the
 * upper half), point edx back at the unshifted table and continue with the
 * four-block path.
 */
vpshufd $252,%xmm0,%xmm0
leal 288(%esp),%edx
vpshufd $252,%xmm1,%xmm1
vpshufd $252,%xmm2,%xmm2
vpshufd $252,%xmm3,%xmm3
vpshufd $252,%xmm4,%xmm4
jmp .L024even
.align 16
.L029done:
/* Write the five hash limbs back to state+0..+16 (edi = state + 48). */
vmovd %xmm0,-48(%edi)
vmovd %xmm1,-44(%edi)
vmovd %xmm2,-40(%edi)
vmovd %xmm3,-36(%edi)
vmovd %xmm4,-32(%edi)
/* Clear the upper ymm halves before returning, then drop the scratch frame. */
vzeroupper
movl %ebp,%esp
.L020nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant pool shared by the vector routines, addressed relative to
 * .Lconst_sse2.  The 64-byte alignment is relied upon by the aligned
 * 32-byte loads of the mask row and by the "andl $-64,%ebx" that
 * _poly1305_blocks_avx2 uses to get back to the start of the pool.
 */
.align 64
.Lconst_sse2:
/* +0:  2^24 in every qword lane; OR-ed into limb 4 it sets bit 128 of a block. */
.long 16777216,0,16777216,0,16777216,0,16777216,0
/* +32: zeros; a load starting 8..32 bytes into the pool sets the bit in fewer lanes. */
.long 0,0,0,0,0,0,0,0
/* +64: 2^26 - 1 in every qword lane, the limb mask. */
.long 67108863,0,67108863,0,67108863,0,67108863,0
/* +96: 0x0fffffff, 0x0ffffffc (x3); not referenced by the AVX2 routines above. */
.long 268435455,268435452,268435452,268435452
/* ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.align 4
/* 16-byte, 4-byte aligned common symbol; poly1305_init reads words 0 and 2 of it to pick a code path. */
.comm OPENSSL_ia32cap_P,16,4
#endif