[SCM] GNU gnutls branch, master, updated. gnutls_3_0_17-14-g9567d93
From: Nikos Mavrogiannopoulos
Subject: [SCM] GNU gnutls branch, master, updated. gnutls_3_0_17-14-g9567d93
Date: Mon, 19 Mar 2012 21:58:37 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU gnutls".
http://git.savannah.gnu.org/cgit/gnutls.git/commit/?id=9567d93c07f87ecb5c8560b7a45125de28710bc1
The branch, master has been updated
via 9567d93c07f87ecb5c8560b7a45125de28710bc1 (commit)
from abbfc182f738c654ebeaf75cf6893acc0947699b (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 9567d93c07f87ecb5c8560b7a45125de28710bc1
Author: Nikos Mavrogiannopoulos <address@hidden>
Date: Mon Mar 19 22:55:14 2012 +0100
updated openssl code
-----------------------------------------------------------------------
Summary of changes:
NEWS | 2 +
devel/perlasm/e_padlock-x86.pl | 104 +++++++--
devel/perlasm/e_padlock-x86_64.pl | 178 ++++++++++-----
devel/perlasm/ghash-x86.pl | 28 ++--
lib/accelerated/x86/README | 4 +-
lib/accelerated/x86/coff/padlock-x86-64-coff.s | 162 +++++++++++----
lib/accelerated/x86/coff/padlock-x86-coff.s | 232 ++++++++++++++------
lib/accelerated/x86/elf/padlock-x86-64.s | 162 +++++++++++----
lib/accelerated/x86/license.txt | 2 +-
lib/accelerated/x86/macosx/padlock-x86-64-macosx.s | 162 +++++++++++----
lib/accelerated/x86/macosx/padlock-x86-macosx.s | 234 ++++++++++++++------
11 files changed, 924 insertions(+), 346 deletions(-)
diff --git a/NEWS b/NEWS
index 27a258c..93fa1ab 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,8 @@ See the end for copying conditions.
** certtool: Avoid a Y2K38 bug when generating certificates.
Patch by Robert Millan.
+** libgnutls: Updated assembler files.
+
** libgnutls: Time in generated certificates is stored
as GeneralizedTime instead of UTCTime (which only stores
2 digits of a year).
diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl
index 7a52528..71ecad3 100644
--- a/devel/perlasm/e_padlock-x86.pl
+++ b/devel/perlasm/e_padlock-x86.pl
@@ -37,7 +37,7 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
-%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
+%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
$ctx="edx";
@@ -188,10 +188,6 @@ my ($mode,$opcode) = @_;
&movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
} else {
&xor ("ebx","ebx");
- if ($PADLOCK_MARGIN{$mode}) {
- &cmp ($len,$PADLOCK_MARGIN{$mode});
- &jbe (&label("${mode}_short"));
- }
&test (&DWP(0,$ctx),1<<5); # align bit in control word
&jnz (&label("${mode}_aligned"));
&test ($out,0x0f);
@@ -212,7 +208,27 @@ my ($mode,$opcode) = @_;
&neg ("eax");
&and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK
&lea ("esp",&DWP(0,"eax","ebp")); # alloca
+ &mov ("eax",$PADLOCK_CHUNK);
+ &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK
+ &mov ("eax","ebp");
+ &and ("ebp",-16);
&and ("esp",-16);
+ &mov (&DWP(16,"ebp"),"eax");
+ if ($PADLOCK_PREFETCH{$mode}) {
+ &cmp ($len,$chunk);
+ &ja (&label("${mode}_loop"));
+ &mov ("eax",$inp); # check if prefetch crosses page
+ &cmp ("ebp","esp");
+ &cmove ("eax",$out);
+ &add ("eax",$len);
+ &neg ("eax");
+ &and ("eax",0xfff); # distance to page boundary
+ &cmp ("eax",$PADLOCK_PREFETCH{$mode});
+ &mov ("eax",-$PADLOCK_PREFETCH{$mode});
+ &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1
+ &and ($chunk,"eax");
+ &jz (&label("${mode}_unaligned_tail"));
+ }
&jmp (&label("${mode}_loop"));
&set_label("${mode}_loop",16);
@@ -276,8 +292,8 @@ my ($mode,$opcode) = @_;
&test ($out,0x0f);
&jz (&label("${mode}_out_aligned"));
&mov ($len,$chunk);
- &shr ($len,2);
&lea ($inp,&DWP(0,"esp"));
+ &shr ($len,2);
&data_byte(0xf3,0xa5); # rep movsl
&sub ($out,$chunk);
&set_label("${mode}_out_aligned");
@@ -288,7 +304,30 @@ my ($mode,$opcode) = @_;
&add ($inp,$chunk);
&sub ($len,$chunk);
&mov ($chunk,$PADLOCK_CHUNK);
+ if (!$PADLOCK_PREFETCH{$mode}) {
&jnz (&label("${mode}_loop"));
+ } else {
+ &jz (&label("${mode}_break"));
+ &cmp ($len,$chunk);
+ &jae (&label("${mode}_loop"));
+
+&set_label("${mode}_unaligned_tail");
+ &xor ("eax","eax");
+ &cmp ("esp","ebp");
+ &cmove ("eax",$len);
+ &sub ("esp","eax"); # alloca
+ &mov ("eax", $out); # save parameters
+ &mov ($chunk,$len);
+ &shr ($len,2);
+ &lea ($out,&DWP(0,"esp"));
+ &data_byte(0xf3,0xa5); # rep movsl
+ &mov ($inp,"esp");
+ &mov ($out,"eax"); # restore parameters
+ &mov ($len,$chunk);
+ &jmp (&label("${mode}_loop"));
+
+&set_label("${mode}_break",16);
+ }
if ($mode ne "ctr32") {
&cmp ("esp","ebp");
&je (&label("${mode}_done"));
@@ -302,28 +341,24 @@ my ($mode,$opcode) = @_;
&ja (&label("${mode}_bzero"));
&set_label("${mode}_done");
+ &mov ("ebp",&DWP(16,"ebp"));
&lea ("esp",&DWP(24,"ebp"));
if ($mode ne "ctr32") {
&jmp (&label("${mode}_exit"));
-&set_label("${mode}_short",16);
- &xor ("eax","eax");
- &lea ("ebp",&DWP(-24,"esp"));
- &sub ("eax",$len);
- &lea ("esp",&DWP(0,"eax","ebp"));
- &and ("esp",-16);
- &xor ($chunk,$chunk);
-&set_label("${mode}_short_copy");
- &movups ("xmm0",&QWP(0,$inp,$chunk));
- &lea ($chunk,&DWP(16,$chunk));
- &cmp ($len,$chunk);
- &movaps (&QWP(-16,"esp",$chunk),"xmm0");
- &ja (&label("${mode}_short_copy"));
- &mov ($inp,"esp");
- &mov ($chunk,$len);
- &jmp (&label("${mode}_loop"));
-
&set_label("${mode}_aligned",16);
+ if ($PADLOCK_PREFETCH{$mode}) {
+ &lea ("ebp",&DWP(0,$inp,$len));
+ &neg ("ebp");
+ &and ("ebp",0xfff); # distance to page boundary
+ &xor ("eax","eax");
+ &cmp ("ebp",$PADLOCK_PREFETCH{$mode});
+ &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1);
+ &cmovae ("ebp","eax");
+ &and ("ebp",$len); # remainder
+ &sub ($len,"ebp");
+ &jz (&label("${mode}_aligned_tail"));
+ }
&lea ("eax",&DWP(-16,$ctx)); # ivp
&lea ("ebx",&DWP(16,$ctx)); # key
&shr ($len,4); # len/=AES_BLOCK_SIZE
@@ -332,6 +367,29 @@ my ($mode,$opcode) = @_;
&movaps ("xmm0",&QWP(0,"eax"));
&movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv
}
+ if ($PADLOCK_PREFETCH{$mode}) {
+ &test ("ebp","ebp");
+ &jz (&label("${mode}_exit"));
+
+&set_label("${mode}_aligned_tail");
+ &mov ($len,"ebp");
+ &lea ("ebp",&DWP(-24,"esp"));
+ &mov ("esp","ebp");
+ &mov ("eax","ebp");
+ &sub ("esp",$len);
+ &and ("ebp",-16);
+ &and ("esp",-16);
+ &mov (&DWP(16,"ebp"),"eax");
+ &mov ("eax", $out); # save parameters
+ &mov ($chunk,$len);
+ &shr ($len,2);
+ &lea ($out,&DWP(0,"esp"));
+ &data_byte(0xf3,0xa5); # rep movsl
+ &mov ($inp,"esp");
+ &mov ($out,"eax"); # restore parameters
+ &mov ($len,$chunk);
+ &jmp (&label("${mode}_loop"));
+ }
&set_label("${mode}_exit"); }
&mov ("eax",1);
&lea ("esp",&DWP(4,"esp")); # popf
diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl
index cbffb9d..4d71d06 100644
--- a/devel/perlasm/e_padlock-x86_64.pl
+++ b/devel/perlasm/e_padlock-x86_64.pl
@@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";
$code=".text\n";
-%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata
+%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
$ctx="%rdx";
@@ -285,17 +285,6 @@ padlock_${mode}_encrypt:
lea 16($ctx),$ctx # control word
xor %eax,%eax
xor %ebx,%ebx
-___
-# Formally speaking correct condtion is $len<=$margin and $inp+$margin
-# crosses page boundary [and next page is unreadable]. But $inp can
-# be unaligned in which case data can be copied to $out if latter is
-# aligned, in which case $out+$margin has to be checked. Covering all
-# cases appears more complicated than just copying short input...
-$code.=<<___ if ($PADLOCK_MARGIN{$mode});
- cmp \$$PADLOCK_MARGIN{$mode},$len
- jbe .L${mode}_short
-___
-$code.=<<___;
testl \$`1<<5`,($ctx) # align bit in control word
jnz .L${mode}_aligned
test \$0x0f,$out
@@ -315,6 +304,8 @@ $code.=<<___;
neg %rax
and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
lea (%rax,%rbp),%rsp
+ mov \$$PADLOCK_CHUNK,%rax
+ cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
@@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32");
bswap %eax
neg %eax
and \$`$PADLOCK_CHUNK/16-1`,%eax
- jz .L${mode}_loop
+ mov \$$PADLOCK_CHUNK,$chunk
shl \$4,%eax
+ cmovz $chunk,%rax
cmp %rax,$len
cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
+ cmovbe $len,$chunk
+___
+$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
+ cmp $chunk,$len
+ ja .L${mode}_loop
+ mov $inp,%rax # check if prefetch crosses page
+ cmp %rsp,%rbp
+ cmove $out,%rax
+ add $len,%rax
+ neg %rax
+ and \$0xfff,%rax # distance to page boundary
+ cmp \$$PADLOCK_PREFETCH{$mode},%rax
+ mov \$-$PADLOCK_PREFETCH{$mode},%rax
+ cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
+ and %rax,$chunk
+ jz .L${mode}_unaligned_tail
___
$code.=<<___;
jmp .L${mode}_loop
@@ -360,12 +368,12 @@ ___
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
test \$0xffff0000,%eax
- jnz .L${mode}_no_corr
+ jnz .L${mode}_no_carry
bswap %eax
add \$0x10000,%eax
bswap %eax
mov %eax,-4($ctx)
-.L${mode}_no_corr:
+.L${mode}_no_carry:
___
$code.=<<___;
mov %r8,$out # restore paramters
@@ -373,8 +381,8 @@ $code.=<<___;
test \$0x0f,$out
jz .L${mode}_out_aligned
mov $chunk,$len
- shr \$3,$len
lea (%rsp),$inp
+ shr \$3,$len
.byte 0xf3,0x48,0xa5 # rep movsq
sub $chunk,$out
.L${mode}_out_aligned:
@@ -384,9 +392,52 @@ $code.=<<___;
add $chunk,$inp
sub $chunk,$len
mov \$$PADLOCK_CHUNK,$chunk
+___
+ if (!$PADLOCK_PREFETCH{$mode}) {
+$code.=<<___;
jnz .L${mode}_loop
-
+___
+ } else {
+$code.=<<___;
+ jz .L${mode}_break
+ cmp $chunk,$len
+ jae .L${mode}_loop
+___
+$code.=<<___ if ($mode eq "ctr32");
+ mov $len,$chunk
+ mov $inp,%rax # check if prefetch crosses page
cmp %rsp,%rbp
+ cmove $out,%rax
+ add $len,%rax
+ neg %rax
+ and \$0xfff,%rax # distance to page boundary
+ cmp \$$PADLOCK_PREFETCH{$mode},%rax
+ mov \$-$PADLOCK_PREFETCH{$mode},%rax
+ cmovae $chunk,%rax
+ and %rax,$chunk
+ jnz .L${mode}_loop
+___
+$code.=<<___;
+.L${mode}_unaligned_tail:
+ xor %eax,%eax
+ cmp %rsp,%rbp
+ cmove $len,%rax
+ mov $out,%r8 # save parameters
+ mov $len,$chunk
+ sub %rax,%rsp # alloca
+ shr \$3,$len
+ lea (%rsp),$out
+ .byte 0xf3,0x48,0xa5 # rep movsq
+ mov %rsp,$inp
+ mov %r8, $out # restore parameters
+ mov $chunk,$len
+ jmp .L${mode}_loop
+.align 16
+.L${mode}_break:
+___
+ }
+$code.=<<___;
+ cmp %rbp,%rsp
je .L${mode}_done
pxor %xmm0,%xmm0
@@ -400,70 +451,87 @@ $code.=<<___;
.L${mode}_done:
lea (%rbp),%rsp
jmp .L${mode}_exit
-___
-$code.=<<___ if ($PADLOCK_MARGIN{$mode});
-.align 16
-.L${mode}_short:
- mov %rsp,%rbp
- sub $len,%rsp
- xor $chunk,$chunk
-.L${mode}_short_copy:
- movups ($inp,$chunk),%xmm0
- lea 16($chunk),$chunk
- cmp $chunk,$len
- movaps %xmm0,-16(%rsp,$chunk)
- ja .L${mode}_short_copy
- mov %rsp,$inp
- mov $len,$chunk
- jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
-___
-$code.=<<___;
+
.align 16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
- mov \$`16*0x10000`,$chunk
bswap %eax
- cmp $len,$chunk
- cmova $len,$chunk
neg %eax
and \$0xffff,%eax
- jz .L${mode}_aligned_loop
+ mov \$`16*0x10000`,$chunk
shl \$4,%eax
+ cmovz $chunk,%rax
cmp %rax,$len
cmova %rax,$chunk # don't let counter cross 2^16
- jmp .L${mode}_aligned_loop
-.align 16
+ cmovbe $len,$chunk
+ jbe .L${mode}_aligned_skip
+
.L${mode}_aligned_loop:
- cmp $len,$chunk
- cmova $len,$chunk
mov $len,%r10 # save parameters
mov $chunk,$len
mov $chunk,%r11
-___
-$code.=<<___;
+
lea -16($ctx),%rax # ivp
lea 16($ctx),%rbx # key
shr \$4,$len # len/=AES_BLOCK_SIZE
.byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
-___
-$code.=<<___ if ($mode !~ /ecb|ctr/);
- movdqa (%rax),%xmm0
- movdqa %xmm0,-16($ctx) # copy [or refresh] iv
-___
-$code.=<<___ if ($mode eq "ctr32");
+
mov -4($ctx),%eax # pull 32-bit counter
bswap %eax
add \$0x10000,%eax
bswap %eax
mov %eax,-4($ctx)
- mov %r11,$chunk # restore paramters
- mov %r10,$len
- sub $chunk,$len
+ mov %r10,$len # restore paramters
+ sub %r11,$len
mov \$`16*0x10000`,$chunk
- jnz .L${mode}_aligned_loop
+ jz .L${mode}_exit
+ cmp $chunk,$len
+ jae .L${mode}_aligned_loop
+
+.L${mode}_aligned_skip:
+___
+$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
+ lea ($inp,$len),%rbp
+ neg %rbp
+ and \$0xfff,%rbp # distance to page boundary
+ xor %eax,%eax
+ cmp \$$PADLOCK_PREFETCH{$mode},%rbp
+ mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
+ cmovae %rax,%rbp
+ and $len,%rbp # remainder
+ sub %rbp,$len
+ jz .L${mode}_aligned_tail
+___
+$code.=<<___;
+ lea -16($ctx),%rax # ivp
+ lea 16($ctx),%rbx # key
+ shr \$4,$len # len/=AES_BLOCK_SIZE
+ .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
+___
+$code.=<<___ if ($mode !~ /ecb|ctr/);
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16($ctx) # copy [or refresh] iv
+___
+$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
+ test %rbp,%rbp # check remainder
+ jz .L${mode}_exit
+
+.L${mode}_aligned_tail:
+ mov $out,%r8
+ mov %rbp,$chunk
+ mov %rbp,$len
+ lea (%rsp),%rbp
+ sub $len,%rsp
+ shr \$3,$len
+ lea (%rsp),$out
+ .byte 0xf3,0x48,0xa5 # rep movsq
+ lea (%r8),$out
+ lea (%rsp),$inp
+ mov $chunk,$len
+ jmp .L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
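[The ctr32 path above also renames ${mode}_no_corr to ${mode}_no_carry,
which matches what the surrounding bswap/add/bswap triplet does. A hedged
Perl paraphrase of that carry step -- under my reading that the engine
advances only the low 16 bits of the big-endian counter; the counter value
is illustrative only:

    my $mem = pack("N", 0xDEAD0000);        # counter bytes at -4($ctx)
    my $raw = unpack("V", $mem);            # mov -4($ctx),%eax (LE load)
    unless ($raw & 0xffff0000) {            # test $0xffff0000,%eax
        my $host = unpack("N", $mem);               # bswap %eax
        $host = ($host + 0x10000) & 0xffffffff;     # add $0x10000,%eax
        $mem = pack("N", $host);                    # bswap; store back
    }
    printf "counter is now 0x%08x\n", unpack("N", $mem);   # 0xdeae0000

Keeping each chunk short of a 2^16-counter (and PADLOCK_CHUNK) boundary, as
the new cmovz/cmova/cmovbe setup does, is what makes this single add
sufficient.]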
diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl
index 1b9adfb..2a1819c 100644
--- a/devel/perlasm/ghash-x86.pl
+++ b/devel/perlasm/ghash-x86.pl
@@ -12,14 +12,14 @@
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
-# gcc 2.95.3(*) MMX assembler x86 assembler
+# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
@@ -30,7 +30,7 @@
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
-# comparison with MMX results is not completely fair, because C
+# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
@@ -40,8 +40,8 @@
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
# May 2010
#
@@ -331,7 +331,7 @@ if (!$x86only) {{{
&static_label("rem_4bit");
-if (0) {{ # "May" MMX version is kept for reference...
+if (!$sse2) {{ # pure-MMX "May" version...
$S=12; # shift factor for rem_4bit
@@ -1273,13 +1273,6 @@ my ($Xhi,$Xi)=@_;
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
-}} # $sse2
-
-&set_label("rem_4bit",64);
- &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
- &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
- &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
- &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
&set_label("rem_8bit",64);
&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
@@ -1313,6 +1306,13 @@ my ($Xhi,$Xi)=@_;
&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
}}} # !$x86only
&asciz("GHASH for x86, CRYPTOGAMS by <address@hidden>");
diff --git a/lib/accelerated/x86/README b/lib/accelerated/x86/README
index 0dd5cb9..ca3c546 100644
--- a/lib/accelerated/x86/README
+++ b/lib/accelerated/x86/README
@@ -1,4 +1,4 @@
-The AES-NI and Padlock implementation by Andy Polyakov is not part of the
-GnuTLS library, but is used with GnuTLS. Its license is included in
+The AES-NI and Padlock implementation by Andy Polyakov are not part of the
+GnuTLS library, but is used with GnuTLS. Their license is included in
license.txt.
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
index b69b332..9f658ee 100644
--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
@@ -354,8 +354,6 @@ padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe .Lecb_short
testl $32,(%rdx)
jnz .Lecb_aligned
testq $15,%rdi
@@ -375,6 +373,21 @@ padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lecb_unaligned_tail
jmp .Lecb_loop
.p2align 4
.Lecb_loop:
@@ -404,8 +417,8 @@ padlock_ecb_encrypt:
testq $15,%rdi
jz .Lecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lecb_out_aligned:
@@ -415,9 +428,26 @@ padlock_ecb_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lecb_loop
-
+ jz .Lecb_break
+ cmpq %rbx,%rcx
+ jae .Lecb_loop
+.Lecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
+.p2align 4
+.Lecb_break:
+ cmpq %rbp,%rsp
je .Lecb_done
pxor %xmm0,%xmm0
@@ -431,26 +461,39 @@ padlock_ecb_encrypt:
.Lecb_done:
leaq (%rbp),%rsp
jmp .Lecb_exit
-.p2align 4
-.Lecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lecb_loop
+
.p2align 4
.Lecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz .Lecb_exit
+
+.Lecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
.Lecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -489,8 +532,6 @@ padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lcbc_short
testl $32,(%rdx)
jnz .Lcbc_aligned
testq $15,%rdi
@@ -510,6 +551,21 @@ padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lcbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lcbc_unaligned_tail
jmp .Lcbc_loop
.p2align 4
.Lcbc_loop:
@@ -541,8 +597,8 @@ padlock_cbc_encrypt:
testq $15,%rdi
jz .Lcbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcbc_out_aligned:
@@ -552,9 +608,26 @@ padlock_cbc_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lcbc_loop
-
+ jz .Lcbc_break
+ cmpq %rbx,%rcx
+ jae .Lcbc_loop
+.Lcbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
+.p2align 4
+.Lcbc_break:
+ cmpq %rbp,%rsp
je .Lcbc_done
pxor %xmm0,%xmm0
@@ -568,28 +641,41 @@ padlock_cbc_encrypt:
.Lcbc_done:
leaq (%rbp),%rsp
jmp .Lcbc_exit
-.p2align 4
-.Lcbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lcbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lcbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lcbc_loop
+
.p2align 4
.Lcbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lcbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz .Lcbc_exit
+
+.Lcbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
.Lcbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
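[In the generated COFF code the same rework reads more directly. The block
added after the alloca amounts to the following decision; this is an
editorial Perl paraphrase, not part of the patch, and $bounced only labels
my assumption about what the cmpq %rsp,%rbp / cmoveq pair distinguishes:

    my ($inp, $out, $len) = (0x601fb0, 0x603000, 72);   # hypothetical
    my $bounced = 0;
    my $chunk = $len & 511;                # andq $512-1,%rbx
    $chunk = 512 if $chunk == 0;           # movq $512,%rax; cmovzq %rax,%rbx
    if ($len <= $chunk) {                  # cmpq %rbx,%rcx; ja .Lecb_loop
        my $p    = $bounced ? $out : $inp; # cmoveq %rdi,%rax
        my $dist = (-($p + $len)) & 0xfff; # distance to the page boundary
        my $mask = $dist < 128 ? -128 : -1;   # cmpq/movq/cmovaeq triple
        $chunk  &= $mask;                  # keep the loop clear of the edge
        print "jz .Lecb_unaligned_tail\n" if $chunk == 0;
    }

A zero chunk sends the leftover bytes through .Lecb_unaligned_tail, which
copies them to the stack with rep movs, encrypts them there, and copies the
result back, instead of letting the engine prefetch past the caller's
buffer.]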
diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
index b068083..69eb468 100644
--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
@@ -180,16 +180,14 @@ _padlock_ecb_encrypt:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe .L006ecb_short
testl $32,(%edx)
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -201,10 +199,28 @@ _padlock_ecb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L008ecb_unaligned_tail
+ jmp .L007ecb_loop
.align 16
-.L008ecb_loop:
+.L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -229,8 +245,8 @@ _padlock_ecb_encrypt:
testl $15,%edi
jz .L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L010ecb_out_aligned:
@@ -240,43 +256,75 @@ _padlock_ecb_encrypt:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L008ecb_loop
+ jz .L011ecb_break
+ cmpl %ebx,%ecx
+ jae .L007ecb_loop
+.L008ecb_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.align 16
+.L011ecb_break:
cmpl %ebp,%esp
- je .L011ecb_done
+ je .L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L012ecb_bzero
-.L011ecb_done:
+ ja .L013ecb_bzero
+.L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L013ecb_exit
+ jmp .L014ecb_exit
.align 16
-.L006ecb_short:
+.L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-.L013ecb_exit:
+ testl %ebp,%ebp
+ jz .L014ecb_exit
+.L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
.L004ecb_abort:
@@ -299,19 +347,17 @@ _padlock_cbc_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
testl $15,%ecx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
leal .Lpadlock_saved_context,%eax
pushfl
cld
call __padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe .L017cbc_short
testl $32,(%edx)
jnz .L018cbc_aligned
testl $15,%edi
@@ -331,7 +377,25 @@ _padlock_cbc_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L020cbc_unaligned_tail
jmp .L019cbc_loop
.align 16
.L019cbc_loop:
@@ -343,13 +407,13 @@ _padlock_cbc_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L020cbc_inp_aligned
+ jz .L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -359,61 +423,93 @@ _padlock_cbc_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L021cbc_out_aligned
+ jz .L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L019cbc_loop
+ jz .L023cbc_break
+ cmpl %ebx,%ecx
+ jae .L019cbc_loop
+.L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.align 16
+.L023cbc_break:
cmpl %ebp,%esp
- je .L022cbc_done
+ je .L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L023cbc_bzero
-.L022cbc_done:
+ ja .L025cbc_bzero
+.L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L024cbc_exit
-.align 16
-.L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L019cbc_loop
+ jmp .L026cbc_exit
.align 16
.L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L027cbc_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,208
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L024cbc_exit:
+ testl %ebp,%ebp
+ jz .L026cbc_exit
+.L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.L026cbc_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -437,10 +533,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne .L026ret
+ jne .L028ret
addl $4,184(%ecx)
movl $0,%eax
-.L026ret:
+.L028ret:
ret
.globl _padlock_sha1_oneshot
.def _padlock_sha1_oneshot; .scl 2; .type 32; .endef
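[The 32-bit variants also gain a bit of frame bookkeeping the 64-bit code
does not need: %ebp is rounded down to 16-byte alignment and its original
value is parked at 16(%ebp), to be reloaded at .L012ecb_done before the
leal 24(%ebp),%esp epilogue. A sketch of that save/restore, under my
reading of the code, with pretend memory and a made-up address:

    my $ebp  = 0x0028ff8c;           # example unaligned frame pointer
    my %mem;                         # stand-in for the stack
    my $save = $ebp;                 # movl %ebp,%eax
    $ebp &= -16;                     # andl $-16,%ebp (andl $-16,%esp too)
    $mem{$ebp + 16} = $save;         # movl %eax,16(%ebp)
    # ... the encryption loop runs on the aligned frame ...
    $ebp = $mem{$ebp + 16};          # movl 16(%ebp),%ebp at .L012ecb_done
    printf "restored ebp = 0x%08x\n", $ebp;   # 0x0028ff8c again
]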
diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s
index bf5f626..4709ac2 100644
--- a/lib/accelerated/x86/elf/padlock-x86-64.s
+++ b/lib/accelerated/x86/elf/padlock-x86-64.s
@@ -276,8 +276,6 @@ padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe .Lecb_short
testl $32,(%rdx)
jnz .Lecb_aligned
testq $15,%rdi
@@ -297,6 +295,21 @@ padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lecb_unaligned_tail
jmp .Lecb_loop
.align 16
.Lecb_loop:
@@ -326,8 +339,8 @@ padlock_ecb_encrypt:
testq $15,%rdi
jz .Lecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lecb_out_aligned:
@@ -337,9 +350,26 @@ padlock_ecb_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lecb_loop
-
+ jz .Lecb_break
+ cmpq %rbx,%rcx
+ jae .Lecb_loop
+.Lecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
+.align 16
+.Lecb_break:
+ cmpq %rbp,%rsp
je .Lecb_done
pxor %xmm0,%xmm0
@@ -353,26 +383,39 @@ padlock_ecb_encrypt:
.Lecb_done:
leaq (%rbp),%rsp
jmp .Lecb_exit
-.align 16
-.Lecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lecb_loop
+
.align 16
.Lecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz .Lecb_exit
+
+.Lecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
.Lecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -400,8 +443,6 @@ padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lcbc_short
testl $32,(%rdx)
jnz .Lcbc_aligned
testq $15,%rdi
@@ -421,6 +462,21 @@ padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lcbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lcbc_unaligned_tail
jmp .Lcbc_loop
.align 16
.Lcbc_loop:
@@ -452,8 +508,8 @@ padlock_cbc_encrypt:
testq $15,%rdi
jz .Lcbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcbc_out_aligned:
@@ -463,9 +519,26 @@ padlock_cbc_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lcbc_loop
-
+ jz .Lcbc_break
+ cmpq %rbx,%rcx
+ jae .Lcbc_loop
+.Lcbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
+.align 16
+.Lcbc_break:
+ cmpq %rbp,%rsp
je .Lcbc_done
pxor %xmm0,%xmm0
@@ -479,28 +552,41 @@ padlock_cbc_encrypt:
.Lcbc_done:
leaq (%rbp),%rsp
jmp .Lcbc_exit
-.align 16
-.Lcbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lcbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lcbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lcbc_loop
+
.align 16
.Lcbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lcbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz .Lcbc_exit
+
+.Lcbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
.Lcbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/license.txt b/lib/accelerated/x86/license.txt
index c87ba42..929ddd5 100755
--- a/lib/accelerated/x86/license.txt
+++ b/lib/accelerated/x86/license.txt
@@ -5,7 +5,7 @@ CRYPTOGAMS licenses depending on where you obtain it. For further
details see http://www.openssl.org/~appro/cryptogams/.
====================================================================
-Copyright (c) 2006, CRYPTOGAMS by <address@hidden>
+Copyright (c) 2006-2012, CRYPTOGAMS by <address@hidden>
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
index 9b912f9..dbd89da 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
@@ -276,8 +276,6 @@ _padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe L$ecb_short
testl $32,(%rdx)
jnz L$ecb_aligned
testq $15,%rdi
@@ -297,6 +295,21 @@ _padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja L$ecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$ecb_unaligned_tail
jmp L$ecb_loop
.p2align 4
L$ecb_loop:
@@ -326,8 +339,8 @@ L$ecb_inp_aligned:
testq $15,%rdi
jz L$ecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$ecb_out_aligned:
@@ -337,9 +350,26 @@ L$ecb_out_aligned:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz L$ecb_loop
-
+ jz L$ecb_break
+ cmpq %rbx,%rcx
+ jae L$ecb_loop
+L$ecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$ecb_loop
+.p2align 4
+L$ecb_break:
+ cmpq %rbp,%rsp
je L$ecb_done
pxor %xmm0,%xmm0
@@ -353,26 +383,39 @@ L$ecb_bzero:
L$ecb_done:
leaq (%rbp),%rsp
jmp L$ecb_exit
-.p2align 4
-L$ecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-L$ecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja L$ecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp L$ecb_loop
+
.p2align 4
L$ecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$ecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz L$ecb_exit
+
+L$ecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$ecb_loop
L$ecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -400,8 +443,6 @@ _padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe L$cbc_short
testl $32,(%rdx)
jnz L$cbc_aligned
testq $15,%rdi
@@ -421,6 +462,21 @@ _padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja L$cbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$cbc_unaligned_tail
jmp L$cbc_loop
.p2align 4
L$cbc_loop:
@@ -452,8 +508,8 @@ L$cbc_inp_aligned:
testq $15,%rdi
jz L$cbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$cbc_out_aligned:
@@ -463,9 +519,26 @@ L$cbc_out_aligned:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz L$cbc_loop
-
+ jz L$cbc_break
+ cmpq %rbx,%rcx
+ jae L$cbc_loop
+L$cbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$cbc_loop
+.p2align 4
+L$cbc_break:
+ cmpq %rbp,%rsp
je L$cbc_done
pxor %xmm0,%xmm0
@@ -479,28 +552,41 @@ L$cbc_bzero:
L$cbc_done:
leaq (%rbp),%rsp
jmp L$cbc_exit
-.p2align 4
-L$cbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-L$cbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja L$cbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp L$cbc_loop
+
.p2align 4
L$cbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$cbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz L$cbc_exit
+
+L$cbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$cbc_loop
L$cbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
index 02b427e..40cfce9 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
@@ -174,16 +174,14 @@ L005ecb_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe L006ecb_short
testl $32,(%edx)
- jnz L007ecb_aligned
+ jnz L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz L007ecb_aligned
+ jnz L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -195,10 +193,28 @@ L005ecb_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz L008ecb_unaligned_tail
+ jmp L007ecb_loop
.align 4,0x90
-L008ecb_loop:
+L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -223,8 +239,8 @@ L009ecb_inp_aligned:
testl $15,%edi
jz L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
L010ecb_out_aligned:
@@ -234,43 +250,75 @@ L010ecb_out_aligned:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz L008ecb_loop
+ jz L011ecb_break
+ cmpl %ebx,%ecx
+ jae L007ecb_loop
+L008ecb_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L007ecb_loop
+.align 4,0x90
+L011ecb_break:
cmpl %ebp,%esp
- je L011ecb_done
+ je L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-L012ecb_bzero:
+L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja L012ecb_bzero
-L011ecb_done:
+ ja L013ecb_bzero
+L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp L013ecb_exit
+ jmp L014ecb_exit
.align 4,0x90
-L006ecb_short:
+L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp L008ecb_loop
-.align 4,0x90
-L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-L013ecb_exit:
+ testl %ebp,%ebp
+ jz L014ecb_exit
+L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L007ecb_loop
+L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
L004ecb_abort:
@@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz L015cbc_abort
+ jnz L016cbc_abort
testl $15,%ecx
- jnz L015cbc_abort
- leal Lpadlock_saved_context-L016cbc_pic_point,%eax
+ jnz L016cbc_abort
+ leal Lpadlock_saved_context-L017cbc_pic_point,%eax
pushfl
cld
call __padlock_verify_ctx
-L016cbc_pic_point:
+L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe L017cbc_short
testl $32,(%edx)
jnz L018cbc_aligned
testl $15,%edi
@@ -324,7 +370,25 @@ L016cbc_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz L020cbc_unaligned_tail
jmp L019cbc_loop
.align 4,0x90
L019cbc_loop:
@@ -336,13 +400,13 @@ L019cbc_loop:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz L020cbc_inp_aligned
+ jz L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-L020cbc_inp_aligned:
+L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -352,61 +416,93 @@ L020cbc_inp_aligned:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz L021cbc_out_aligned
+ jz L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-L021cbc_out_aligned:
+L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz L019cbc_loop
+ jz L023cbc_break
+ cmpl %ebx,%ecx
+ jae L019cbc_loop
+L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L019cbc_loop
+.align 4,0x90
+L023cbc_break:
cmpl %ebp,%esp
- je L022cbc_done
+ je L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-L023cbc_bzero:
+L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja L023cbc_bzero
-L022cbc_done:
+ ja L025cbc_bzero
+L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp L024cbc_exit
-.align 4,0x90
-L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp L019cbc_loop
+ jmp L026cbc_exit
.align 4,0x90
L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz L027cbc_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,208
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-L024cbc_exit:
+ testl %ebp,%ebp
+ jz L026cbc_exit
+L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L019cbc_loop
+L026cbc_exit:
movl $1,%eax
leal 4(%esp),%esp
-L015cbc_abort:
+L016cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -428,10 +524,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne L026ret
+ jne L028ret
addl $4,184(%ecx)
movl $0,%eax
-L026ret:
+L028ret:
ret
.globl _padlock_sha1_oneshot
.align 4
hooks/post-receive
--
GNU gnutls