gnutls-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[SCM] GNU gnutls branch, master, updated. gnutls_3_0_8-5-g06010f7


From: Nikos Mavrogiannopoulos
Subject: [SCM] GNU gnutls branch, master, updated. gnutls_3_0_8-5-g06010f7
Date: Sun, 13 Nov 2011 15:08:27 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU gnutls".

http://git.savannah.gnu.org/cgit/gnutls.git/commit/?id=06010f7310003259e617ada2a7275900553b9e99

The branch, master has been updated
       via  06010f7310003259e617ada2a7275900553b9e99 (commit)
       via  caad8f49b25ad435d3d059bed12dfc5d381fb34f (commit)
       via  0e0d7e70a310a864c10e69c1d416e5f290b05285 (commit)
      from  da0244850b22bbe9cfee451c6cbe741d7552a5c7 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 06010f7310003259e617ada2a7275900553b9e99
Author: Nikos Mavrogiannopoulos <address@hidden>
Date:   Sun Nov 13 16:08:49 2011 +0100

    Commited new assembler files.

commit caad8f49b25ad435d3d059bed12dfc5d381fb34f
Author: Nikos Mavrogiannopoulos <address@hidden>
Date:   Sun Nov 13 15:25:06 2011 +0100

    Added rules to auto-generate the assembler files.

commit 0e0d7e70a310a864c10e69c1d416e5f290b05285
Author: Nikos Mavrogiannopoulos <address@hidden>
Date:   Sat Nov 12 22:54:36 2011 +0100

    more files to ignore

-----------------------------------------------------------------------

Summary of changes:
 .gitignore                                        |    5 +-
 cfg.mk                                            |   91 +
 devel/perlasm/aesni-x86.pl                        | 2189 +++++++++++++++
 devel/perlasm/aesni-x86_64.pl                     | 3068 +++++++++++++++++++++
 devel/perlasm/cbc.pl                              |  349 +++
 devel/perlasm/cpuid-x86.pl                        |   57 +
 devel/perlasm/cpuid-x86_64.pl                     |   69 +
 devel/perlasm/e_padlock-x86.pl                    |  548 ++++
 devel/perlasm/e_padlock-x86_64.pl                 |  498 ++++
 devel/perlasm/ghash-x86.pl                        | 1342 +++++++++
 devel/perlasm/ghash-x86_64.pl                     |  805 ++++++
 devel/perlasm/license-gnutls.txt                  |   20 +
 devel/perlasm/license.txt                         |   37 +
 devel/perlasm/ppc-xlate.pl                        |  159 ++
 devel/perlasm/readme                              |  124 +
 devel/perlasm/x86_64-xlate.pl                     | 1083 ++++++++
 devel/perlasm/x86asm.pl                           |  260 ++
 devel/perlasm/x86gas.pl                           |  255 ++
 devel/perlasm/x86masm.pl                          |  196 ++
 devel/perlasm/x86nasm.pl                          |  177 ++
 lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s |    2 +-
 lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s  |   60 +-
 lib/accelerated/x86/asm-coff/cpuid-x86-coff.s     |   78 +-
 lib/accelerated/x86/asm-coff/padlock-x86-coff.s   |    2 +-
 lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s    |    3 -
 lib/accelerated/x86/asm/appro-aes-x86-64.s        |   10 +-
 lib/accelerated/x86/asm/appro-aes-x86.s           |   14 +-
 lib/accelerated/x86/asm/cpuid-x86-64.s            |   62 +-
 lib/accelerated/x86/asm/cpuid-x86.s               |   89 +-
 lib/accelerated/x86/asm/padlock-x86-64.s          |    2 -
 lib/accelerated/x86/asm/padlock-x86.s             |    4 +-
 31 files changed, 11475 insertions(+), 183 deletions(-)
 create mode 100644 devel/perlasm/aesni-x86.pl
 create mode 100644 devel/perlasm/aesni-x86_64.pl
 create mode 100644 devel/perlasm/cbc.pl
 create mode 100644 devel/perlasm/cpuid-x86.pl
 create mode 100644 devel/perlasm/cpuid-x86_64.pl
 create mode 100644 devel/perlasm/e_padlock-x86.pl
 create mode 100644 devel/perlasm/e_padlock-x86_64.pl
 create mode 100644 devel/perlasm/ghash-x86.pl
 create mode 100644 devel/perlasm/ghash-x86_64.pl
 create mode 100644 devel/perlasm/license-gnutls.txt
 create mode 100644 devel/perlasm/license.txt
 create mode 100755 devel/perlasm/ppc-xlate.pl
 create mode 100644 devel/perlasm/readme
 create mode 100755 devel/perlasm/x86_64-xlate.pl
 create mode 100644 devel/perlasm/x86asm.pl
 create mode 100644 devel/perlasm/x86gas.pl
 create mode 100644 devel/perlasm/x86masm.pl
 create mode 100644 devel/perlasm/x86nasm.pl

diff --git a/.gitignore b/.gitignore
index 86d0b4c..92174c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -462,7 +462,6 @@ tests/dn
 tests/dn2
 tests/finished
 tests/gc
-tests/gendh
 tests/hostname-check
 tests/infoaccess
 tests/init_roundtrip
@@ -529,4 +528,6 @@ tests/x509paths/
 tests/x509self
 tests/x509sign-verify
 tests/x509signself
-tests/keygen
+tests/slow/keygen
+tests/slow/gendh
+doc/reference/*.bak
diff --git a/cfg.mk b/cfg.mk
index 156296a..6849ed5 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -131,3 +131,94 @@ upload-web:
        cd $(htmldir) && \
                cvs commit -m "Update." manual/ reference/ \
                        doxygen/ devel/ cyclo/
+
+ASM_SOURCES:= lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s \
+       lib/accelerated/x86/asm/cpuid-x86-64.s \
+       lib/accelerated/x86/asm-coff/cpuid-x86-coff.s \
+       lib/accelerated/x86/asm/cpuid-x86.s \
+       lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s \
+       lib/accelerated/x86/asm/appro-aes-x86-64.s \
+       lib/accelerated/x86/asm/appro-aes-x86.s \
+       lib/accelerated/x86/asm/padlock-x86-64.s \
+       lib/accelerated/x86/asm/padlock-x86.s \
+       lib/accelerated/x86/asm-coff/appro-aes-gcm-x86-64-coff.s \
+       lib/accelerated/x86/asm-coff/appro-aes-x86-64-coff.s \
+       lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s \
+       lib/accelerated/x86/asm-coff/padlock-x86-64-coff.s \
+       lib/accelerated/x86/asm-coff/padlock-x86-coff.s
+
+asm-sources: $(ASM_SOURCES)
+
+asm-sources-clean:
+       rm -f $(ASM_SOURCES)
+
+lib/accelerated/x86/asm/cpuid-x86-64.s: devel/perlasm/cpuid-x86_64.pl
+       cat devel/perlasm/license-gnutls.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+
+lib/accelerated/x86/asm/cpuid-x86.s: devel/perlasm/cpuid-x86.pl
+       cat devel/perlasm/license-gnutls.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s: devel/perlasm/ghash-x86_64.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+lib/accelerated/x86/asm/appro-aes-x86-64.s: devel/perlasm/aesni-x86_64.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+lib/accelerated/x86/asm/appro-aes-x86.s: devel/perlasm/aesni-x86.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+lib/accelerated/x86/asm/padlock-x86-64.s: devel/perlasm/e_padlock-x86_64.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+lib/accelerated/x86/asm/padlock-x86.s: devel/perlasm/e_padlock-x86.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< elf >> $@
+       echo "" >> $@
+       echo ".section .note.GNU-stack,\"\",%progbits" >> $@
+
+lib/accelerated/x86/asm-coff/appro-aes-gcm-x86-64-coff.s: devel/perlasm/ghash-x86_64.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< mingw64 >> $@
+
+lib/accelerated/x86/asm-coff/appro-aes-x86-64-coff.s: devel/perlasm/aesni-x86_64.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< mingw64 >> $@
+
+lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s: devel/perlasm/aesni-x86.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< coff >> $@
+
+lib/accelerated/x86/asm-coff/padlock-x86-64-coff.s: devel/perlasm/e_padlock-x86_64.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< mingw64 >> $@
+
+lib/accelerated/x86/asm-coff/padlock-x86-coff.s: devel/perlasm/e_padlock-x86.pl
+       cat devel/perlasm/license.txt > $@
+       perl $< coff >> $@
+
+lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s: devel/perlasm/cpuid-x86_64.pl
+       cat devel/perlasm/license-gnutls.txt > $@
+       perl $< mingw64 >> $@
+
+lib/accelerated/x86/asm-coff/cpuid-x86-coff.s: devel/perlasm/cpuid-x86.pl
+       cat devel/perlasm/license-gnutls.txt > $@
+       perl $< coff >> $@
diff --git a/devel/perlasm/aesni-x86.pl b/devel/perlasm/aesni-x86.pl
new file mode 100644
index 0000000..3dc345b
--- /dev/null
+++ b/devel/perlasm/aesni-x86.pl
@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <address@hidden> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
+# details].
+#
+# Performance.
+#
+# To start with see corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling table similar to one found there I've chosen to
+# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
+# The simplified table below represents 32-bit performance relative
+# to 64-bit one in every given point. Ratios vary for different
+# encryption modes, therefore interval values.
+#
+#      16-byte     64-byte     256-byte    1-KB        8-KB
+#      53-67%      67-84%      91-94%      95-98%      97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. Largest
+# 8-KB block performance is virtually same: 32-bit code is less than
+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
+# interleaves at most 6 aes[enc|dec] instructions, because there are
+# not enough registers for 8x interleave [which should be optimal for
+# Sandy Bridge]. Actually, performance results for 6x interleave
+# factor presented in aesni-x86_64.pl (except for CTR) are for this
+# module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+
+$PREFIX="aesni";       # if $PREFIX is set to "AES", the script
+                       # generates drop-in replacement for
+                       # crypto/aes/asm/aes-586.pl:-)
+$inline=1;             # inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni")        { $movekey=*movups; }
+else                   { $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx";        # backup copy for $rounds
+$key_="ebp";   # backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5";        $in1="xmm5";
+$inout4="xmm6";        $in0="xmm6";
+$inout5="xmm7";        $ivec="xmm7";
+
+# AESNI extension
+sub aeskeygenassist    # hand-encode "aeskeygenassist dst,src,imm" as raw bytes
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)        # xmm,xmm form only; anything else emits nothing
+    {  &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);   }       # 66 0F 3A DF, ModRM=11|dst|src, imm8
+}
+sub aescommon  # shared encoder for the 66 0F 38 xx reg,reg AES instructions
+{ my($opcodelet,$dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)        # xmm,xmm form only; anything else emits nothing
+    {  &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} # ModRM=11|dst|src
+}
+sub aesimc     { aescommon(0xdb,@_); }  # emits 66 0F 38 DB /r
+sub aesenc     { aescommon(0xdc,@_); }  # emits 66 0F 38 DC /r
+sub aesenclast { aescommon(0xdd,@_); }  # emits 66 0F 38 DD /r
+sub aesdec     { aescommon(0xde,@_); }  # emits 66 0F 38 DE /r
+sub aesdeclast { aescommon(0xdf,@_); }  # emits 66 0F 38 DF /r
+
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;      # expansion counter, private to aesni_inline_generate1
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+  $sn++;       # gives every expansion its own loop label
+
+    &$movekey          ($rndkey0,&QWP(0,$key));
+    &$movekey          ($rndkey1,&QWP(16,$key));
+    &xorps             ($ivec,$rndkey0)        if (defined($ivec));
+    &lea               ($key,&DWP(32,$key));
+    &xorps             ($inout,$ivec)          if (defined($ivec));    # inout ^= ivec ^ round key 0
+    &xorps             ($inout,$rndkey0)       if (!defined($ivec));   # inout ^= round key 0
+    &set_label("${p}1_loop_$sn");
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &dec            ($rounds);
+       &$movekey       ($rndkey1,&QWP(0,$key));
+       &lea            ($key,&DWP(16,$key));   # lea leaves the flags set by dec intact
+    &jnz               (&label("${p}1_loop_$sn"));
+    eval"&aes${p}last  ($inout,$rndkey1)";
+}}
+
+sub aesni_generate1    # fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+    &function_begin_B("_aesni_${p}rypt1");
+       &movups         ($rndkey0,&QWP(0,$key));
+       &$movekey       ($rndkey1,&QWP(0x10,$key));
+       &xorps          ($inout,$rndkey0);
+       &$movekey       ($rndkey0,&QWP(0x20,$key));
+       &lea            ($key,&DWP(0x30,$key));
+       &cmp            ($rounds,11);
+       &jb             (&label("${p}128"));    # rounds < 11: enter at the shortest tail
+       &lea            ($key,&DWP(0x20,$key)); # lea keeps the flags from cmp
+       &je             (&label("${p}192"));    # rounds == 11: skip the first two extra rounds
+       &lea            ($key,&DWP(0x20,$key));
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(-0x40,$key));
+       eval"&aes${p}   ($inout,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(-0x30,$key));
+    &set_label("${p}192");
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(-0x20,$key));
+       eval"&aes${p}   ($inout,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(-0x10,$key));
+    &set_label("${p}128");
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(0,$key));
+       eval"&aes${p}   ($inout,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0x10,$key));
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(0x20,$key));
+       eval"&aes${p}   ($inout,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0x30,$key));
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(0x40,$key));
+       eval"&aes${p}   ($inout,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0x50,$key));
+       eval"&aes${p}   ($inout,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(0x60,$key));
+       eval"&aes${p}   ($inout,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0x70,$key));
+       eval"&aes${p}   ($inout,$rndkey1)";
+    eval"&aes${p}last  ($inout,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline); # out-of-line helper only needed when not inlining
+&function_begin_B("${PREFIX}_encrypt");
+       &mov    ("eax",&wparam(0));             # first argument (inp per the prototype above)
+       &mov    ($key,&wparam(2));              # third argument (key)
+       &movups ($inout0,&QWP(0,"eax"));        # load one 16-byte input block
+       &mov    ($rounds,&DWP(240,$key));       # round count is read from offset 240 of the key
+       &mov    ("eax",&wparam(1));             # eax reused for the second argument (out)
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+       &movups (&QWP(0,"eax"),$inout0);        # store the processed block
+       &ret    ();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if(!$inline);  # out-of-line helper only needed when not inlining
+&function_begin_B("${PREFIX}_decrypt");
+       &mov    ("eax",&wparam(0));             # first argument (inp per the prototype above)
+       &mov    ($key,&wparam(2));              # third argument (key)
+       &movups ($inout0,&QWP(0,"eax"));        # load one 16-byte input block
+       &mov    ($rounds,&DWP(240,$key));       # round count is read from offset 240 of the key
+       &mov    ("eax",&wparam(1));             # eax reused for the second argument (out)
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+       &movups (&QWP(0,"eax"),$inout0);        # store the processed block
+       &ret    ();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x, but it's unfeasible to accommodate it
+# in XMM registers addressable in 32-bit mode and therefore 6x is
+# used instead...
+
+sub aesni_generate3
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt3");
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       &shr            ($rounds,1);
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       &lea            ($key,&DWP(32,$key));
+       &xorps          ($inout0,$rndkey0);
+       &pxor           ($inout1,$rndkey0);
+       &pxor           ($inout2,$rndkey0);
+       &$movekey       ($rndkey0,&QWP(0,$key));
+
+    &set_label("${p}3_loop");
+       eval"&aes${p}   ($inout0,$rndkey1)";
+       eval"&aes${p}   ($inout1,$rndkey1)";
+       &dec            ($rounds);
+       eval"&aes${p}   ($inout2,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       eval"&aes${p}   ($inout0,$rndkey0)";
+       eval"&aes${p}   ($inout1,$rndkey0)";
+       &lea            ($key,&DWP(32,$key));
+       eval"&aes${p}   ($inout2,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       &jnz            (&label("${p}3_loop"));
+    eval"&aes${p}      ($inout0,$rndkey1)";
+    eval"&aes${p}      ($inout1,$rndkey1)";
+    eval"&aes${p}      ($inout2,$rndkey1)";
+    eval"&aes${p}last  ($inout0,$rndkey0)";
+    eval"&aes${p}last  ($inout1,$rndkey0)";
+    eval"&aes${p}last  ($inout2,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt3");
+}
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement  would be <20%,
+# so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt4");
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       &shr            ($rounds,1);
+       &lea            ($key,&DWP(32,$key));
+       &xorps          ($inout0,$rndkey0);
+       &pxor           ($inout1,$rndkey0);
+       &pxor           ($inout2,$rndkey0);
+       &pxor           ($inout3,$rndkey0);
+       &$movekey       ($rndkey0,&QWP(0,$key));
+
+    &set_label("${p}4_loop");
+       eval"&aes${p}   ($inout0,$rndkey1)";
+       eval"&aes${p}   ($inout1,$rndkey1)";
+       &dec            ($rounds);
+       eval"&aes${p}   ($inout2,$rndkey1)";
+       eval"&aes${p}   ($inout3,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       eval"&aes${p}   ($inout0,$rndkey0)";
+       eval"&aes${p}   ($inout1,$rndkey0)";
+       &lea            ($key,&DWP(32,$key));
+       eval"&aes${p}   ($inout2,$rndkey0)";
+       eval"&aes${p}   ($inout3,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0,$key));
+    &jnz               (&label("${p}4_loop"));
+
+    eval"&aes${p}      ($inout0,$rndkey1)";
+    eval"&aes${p}      ($inout1,$rndkey1)";
+    eval"&aes${p}      ($inout2,$rndkey1)";
+    eval"&aes${p}      ($inout3,$rndkey1)";
+    eval"&aes${p}last  ($inout0,$rndkey0)";
+    eval"&aes${p}last  ($inout1,$rndkey0)";
+    eval"&aes${p}last  ($inout2,$rndkey0)";
+    eval"&aes${p}last  ($inout3,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt6");
+    &static_label("_aesni_${p}rypt6_enter");
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       &shr            ($rounds,1);
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       &lea            ($key,&DWP(32,$key));
+       &xorps          ($inout0,$rndkey0);
+       &pxor           ($inout1,$rndkey0);     # pxor does better here
+       eval"&aes${p}   ($inout0,$rndkey1)";
+       &pxor           ($inout2,$rndkey0);
+       eval"&aes${p}   ($inout1,$rndkey1)";
+       &pxor           ($inout3,$rndkey0);
+       &dec            ($rounds);
+       eval"&aes${p}   ($inout2,$rndkey1)";
+       &pxor           ($inout4,$rndkey0);
+       eval"&aes${p}   ($inout3,$rndkey1)";
+       &pxor           ($inout5,$rndkey0);
+       eval"&aes${p}   ($inout4,$rndkey1)";
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       eval"&aes${p}   ($inout5,$rndkey1)";
+       &jmp            (&label("_aesni_${p}rypt6_enter"));
+
+    &set_label("${p}6_loop",16);
+       eval"&aes${p}   ($inout0,$rndkey1)";
+       eval"&aes${p}   ($inout1,$rndkey1)";
+       &dec            ($rounds);
+       eval"&aes${p}   ($inout2,$rndkey1)";
+       eval"&aes${p}   ($inout3,$rndkey1)";
+       eval"&aes${p}   ($inout4,$rndkey1)";
+       eval"&aes${p}   ($inout5,$rndkey1)";
+    &set_label("_aesni_${p}rypt6_enter",16);
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       eval"&aes${p}   ($inout0,$rndkey0)";
+       eval"&aes${p}   ($inout1,$rndkey0)";
+       &lea            ($key,&DWP(32,$key));
+       eval"&aes${p}   ($inout2,$rndkey0)";
+       eval"&aes${p}   ($inout3,$rndkey0)";
+       eval"&aes${p}   ($inout4,$rndkey0)";
+       eval"&aes${p}   ($inout5,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(0,$key));
+    &jnz               (&label("${p}6_loop"));
+
+    eval"&aes${p}      ($inout0,$rndkey1)";
+    eval"&aes${p}      ($inout1,$rndkey1)";
+    eval"&aes${p}      ($inout2,$rndkey1)";
+    eval"&aes${p}      ($inout3,$rndkey1)";
+    eval"&aes${p}      ($inout4,$rndkey1)";
+    eval"&aes${p}      ($inout5,$rndkey1)";
+    eval"&aes${p}last  ($inout0,$rndkey0)";
+    eval"&aes${p}last  ($inout1,$rndkey0)";
+    eval"&aes${p}last  ($inout2,$rndkey0)";
+    eval"&aes${p}last  ($inout3,$rndkey0)";
+    eval"&aes${p}last  ($inout4,$rndkey0)";
+    eval"&aes${p}last  ($inout5,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+#                         size_t length, const AES_KEY *key,
+#                         int enc);
+&function_begin("aesni_ecb_encrypt");
+       &mov    ($inp,&wparam(0));
+       &mov    ($out,&wparam(1));
+       &mov    ($len,&wparam(2));
+       &mov    ($key,&wparam(3));
+       &mov    ($rounds_,&wparam(4));
+       &and    ($len,-16);
+       &jz     (&label("ecb_ret"));
+       &mov    ($rounds,&DWP(240,$key));
+       &test   ($rounds_,$rounds_);
+       &jz     (&label("ecb_decrypt"));
+
+       &mov    ($key_,$key);           # backup $key
+       &mov    ($rounds_,$rounds);     # backup $rounds
+       &cmp    ($len,0x60);
+       &jb     (&label("ecb_enc_tail"));
+
+       &movdqu ($inout0,&QWP(0,$inp));
+       &movdqu ($inout1,&QWP(0x10,$inp));
+       &movdqu ($inout2,&QWP(0x20,$inp));
+       &movdqu ($inout3,&QWP(0x30,$inp));
+       &movdqu ($inout4,&QWP(0x40,$inp));
+       &movdqu ($inout5,&QWP(0x50,$inp));
+       &lea    ($inp,&DWP(0x60,$inp));
+       &sub    ($len,0x60);
+       &jmp    (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+       &movups (&QWP(0,$out),$inout0);
+       &movdqu ($inout0,&QWP(0,$inp));
+       &movups (&QWP(0x10,$out),$inout1);
+       &movdqu ($inout1,&QWP(0x10,$inp));
+       &movups (&QWP(0x20,$out),$inout2);
+       &movdqu ($inout2,&QWP(0x20,$inp));
+       &movups (&QWP(0x30,$out),$inout3);
+       &movdqu ($inout3,&QWP(0x30,$inp));
+       &movups (&QWP(0x40,$out),$inout4);
+       &movdqu ($inout4,&QWP(0x40,$inp));
+       &movups (&QWP(0x50,$out),$inout5);
+       &lea    ($out,&DWP(0x60,$out));
+       &movdqu ($inout5,&QWP(0x50,$inp));
+       &lea    ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+       &call   ("_aesni_encrypt6");
+
+       &mov    ($key,$key_);           # restore $key
+       &mov    ($rounds,$rounds_);     # restore $rounds
+       &sub    ($len,0x60);
+       &jnc    (&label("ecb_enc_loop6"));
+
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &movups (&QWP(0x40,$out),$inout4);
+       &movups (&QWP(0x50,$out),$inout5);
+       &lea    ($out,&DWP(0x60,$out));
+       &add    ($len,0x60);
+       &jz     (&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");            # fewer than six whole blocks remain in $len
+       &movups ($inout0,&QWP(0,$inp));
+       &cmp    ($len,0x20);
+       &jb     (&label("ecb_enc_one"));        # $len below 0x20: single block
+       &movups ($inout1,&QWP(0x10,$inp));      # movups leaves the cmp flags intact
+       &je     (&label("ecb_enc_two"));        # $len == 0x20
+       &movups ($inout2,&QWP(0x20,$inp));
+       &cmp    ($len,0x40);
+       &jb     (&label("ecb_enc_three"));      # $len between 0x20 and 0x40
+       &movups ($inout3,&QWP(0x30,$inp));
+       &je     (&label("ecb_enc_four"));       # $len == 0x40
+       &movups ($inout4,&QWP(0x40,$inp));      # remaining case: five blocks
+       &xorps  ($inout5,$inout5);              # zero the unused sixth lane before the 6x routine
+       &call   ("_aesni_encrypt6");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &movups (&QWP(0x40,$out),$inout4);      # sixth result is discarded
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+       &movups (&QWP(0,$out),$inout0);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+       &xorps  ($inout2,$inout2);
+       &call   ("_aesni_encrypt3");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+       &call   ("_aesni_encrypt3");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+       &call   ("_aesni_encrypt4");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &jmp    (&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+       &mov    ($key_,$key);           # backup $key
+       &mov    ($rounds_,$rounds);     # backup $rounds
+       &cmp    ($len,0x60);
+       &jb     (&label("ecb_dec_tail"));
+
+       &movdqu ($inout0,&QWP(0,$inp));
+       &movdqu ($inout1,&QWP(0x10,$inp));
+       &movdqu ($inout2,&QWP(0x20,$inp));
+       &movdqu ($inout3,&QWP(0x30,$inp));
+       &movdqu ($inout4,&QWP(0x40,$inp));
+       &movdqu ($inout5,&QWP(0x50,$inp));
+       &lea    ($inp,&DWP(0x60,$inp));
+       &sub    ($len,0x60);
+       &jmp    (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+       &movups (&QWP(0,$out),$inout0);
+       &movdqu ($inout0,&QWP(0,$inp));
+       &movups (&QWP(0x10,$out),$inout1);
+       &movdqu ($inout1,&QWP(0x10,$inp));
+       &movups (&QWP(0x20,$out),$inout2);
+       &movdqu ($inout2,&QWP(0x20,$inp));
+       &movups (&QWP(0x30,$out),$inout3);
+       &movdqu ($inout3,&QWP(0x30,$inp));
+       &movups (&QWP(0x40,$out),$inout4);
+       &movdqu ($inout4,&QWP(0x40,$inp));
+       &movups (&QWP(0x50,$out),$inout5);
+       &lea    ($out,&DWP(0x60,$out));
+       &movdqu ($inout5,&QWP(0x50,$inp));
+       &lea    ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+       &call   ("_aesni_decrypt6");
+
+       &mov    ($key,$key_);           # restore $key
+       &mov    ($rounds,$rounds_);     # restore $rounds
+       &sub    ($len,0x60);
+       &jnc    (&label("ecb_dec_loop6"));
+
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &movups (&QWP(0x40,$out),$inout4);
+       &movups (&QWP(0x50,$out),$inout5);
+       &lea    ($out,&DWP(0x60,$out));
+       &add    ($len,0x60);
+       &jz     (&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+       &movups ($inout0,&QWP(0,$inp));
+       &cmp    ($len,0x20);
+       &jb     (&label("ecb_dec_one"));
+       &movups ($inout1,&QWP(0x10,$inp));
+       &je     (&label("ecb_dec_two"));
+       &movups ($inout2,&QWP(0x20,$inp));
+       &cmp    ($len,0x40);
+       &jb     (&label("ecb_dec_three"));
+       &movups ($inout3,&QWP(0x30,$inp));
+       &je     (&label("ecb_dec_four"));
+       &movups ($inout4,&QWP(0x40,$inp));
+       &xorps  ($inout5,$inout5);
+       &call   ("_aesni_decrypt6");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &movups (&QWP(0x40,$out),$inout4);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+       &movups (&QWP(0,$out),$inout0);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+       &xorps  ($inout2,$inout2);
+       &call   ("_aesni_decrypt3");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+       &call   ("_aesni_decrypt3");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &jmp    (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+       &call   ("_aesni_decrypt4");
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks"); # per block: out=in^E(counter), cmac=E(cmac^in)
+       &mov    ($inp,&wparam(0));              # in
+       &mov    ($out,&wparam(1));              # out
+       &mov    ($len,&wparam(2));              # blocks (count of 16-byte blocks)
+       &mov    ($key,&wparam(3));              # key schedule
+       &mov    ($rounds_,&wparam(4));          # ivec pointer; register is reused below
+       &mov    ($rounds,&wparam(5));           # cmac pointer; register is reused below
+       &mov    ($key_,"esp");                  # caller's %esp
+       &sub    ("esp",60);                     # scratch frame: mask, increment, saved %esp
+       &and    ("esp",-16);                    # align stack
+       &mov    (&DWP(48,"esp"),$key_);         # 48(%esp) = caller's %esp
+
+       &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
+       &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
+       &mov    ($rounds,&DWP(240,$key));       # round count is read from 240($key)
+
+       # compose byte-swap control mask for pshufb on stack
+       &mov    (&DWP(0,"esp"),0x0c0d0e0f);     # 0(%esp): selector bytes 15..0, i.e. full 16-byte reversal
+       &mov    (&DWP(4,"esp"),0x08090a0b);
+       &mov    (&DWP(8,"esp"),0x04050607);
+       &mov    (&DWP(12,"esp"),0x00010203);
+
+       # compose counter increment vector on stack
+       &mov    ($rounds_,1);
+       &xor    ($key_,$key_);
+       &mov    (&DWP(16,"esp"),$rounds_);      # 16(%esp): dwords 1,0,0,0 = 64-bit 1 in the low qword
+       &mov    (&DWP(20,"esp"),$key_);
+       &mov    (&DWP(24,"esp"),$key_);
+       &mov    (&DWP(28,"esp"),$key_);
+
+       &shr    ($rounds,1);                    # inner loop performs two rounds per pass
+       &lea    ($key_,&DWP(0,$key));           # backup $key
+       &movdqa ($inout3,&QWP(0,"esp"));        # byte-swap mask stays in $inout3
+       &movdqa ($inout0,$ivec);                # first counter block, original byte order
+       &mov    ($rounds_,$rounds);             # backup halved $rounds
+       &pshufb ($ivec,$inout3);                # $ivec is kept byte-reversed so paddq can bump it
+       # NOTE(review): the outer loop is bottom-tested, so blocks==0 is not special-cased; callers presumably pass >=1
+&set_label("ccm64_enc_outer");
+       &$movekey       ($rndkey0,&QWP(0,$key_));       # round key 0
+       &mov            ($rounds,$rounds_);             # restore inner loop count
+       &movups         ($in0,&QWP(0,$inp));            # load input block
+
+       &xorps          ($inout0,$rndkey0);             # counter block ^= round key 0
+       &$movekey       ($rndkey1,&QWP(16,$key_));      # round key 1
+       &xorps          ($rndkey0,$in0);                # round key 0 ^ input
+       &lea            ($key,&DWP(32,$key_));
+       &xorps          ($cmac,$rndkey0);               # cmac^=inp (and round key 0)
+       &$movekey       ($rndkey0,&QWP(0,$key));        # round key 2
+
+&set_label("ccm64_enc2_loop");                         # two rounds per pass on counter block and cmac
+       &aesenc         ($inout0,$rndkey1);
+       &dec            ($rounds);                      # sets ZF for the jnz at the loop bottom
+       &aesenc         ($cmac,$rndkey1);
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       &aesenc         ($inout0,$rndkey0);
+       &lea            ($key,&DWP(32,$key));           # advance by two round keys
+       &aesenc         ($cmac,$rndkey0);
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       &jnz            (&label("ccm64_enc2_loop"));
+       &aesenc         ($inout0,$rndkey1);
+       &aesenc         ($cmac,$rndkey1);
+       &paddq          ($ivec,&QWP(16,"esp"));         # counter += 1 (on the byte-reversed copy)
+       &aesenclast     ($inout0,$rndkey0);             # $inout0 = E(counter)
+       &aesenclast     ($cmac,$rndkey0);               # $cmac = E(cmac^inp)
+
+       &dec    ($len);                         # sets ZF for the jnz below
+       &lea    ($inp,&DWP(16,$inp));
+       &xorps  ($in0,$inout0);                 # inp^=E(ivec)
+       &movdqa ($inout0,$ivec);                # next counter, still byte-reversed
+       &movups (&QWP(0,$out),$in0);            # save output
+       &lea    ($out,&DWP(16,$out));
+       &pshufb ($inout0,$inout3);              # back to original byte order
+       &jnz    (&label("ccm64_enc_outer"));
+
+       &mov    ("esp",&DWP(48,"esp"));         # restore caller's %esp
+       &mov    ($out,&wparam(5));              # cmac pointer
+       &movups (&QWP(0,$out),$cmac);           # write back updated cmac
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks"); # per block: out=in^E(counter), then out is folded into cmac
+       &mov    ($inp,&wparam(0));              # in
+       &mov    ($out,&wparam(1));              # out
+       &mov    ($len,&wparam(2));              # blocks (count of 16-byte blocks)
+       &mov    ($key,&wparam(3));              # key schedule
+       &mov    ($rounds_,&wparam(4));          # ivec pointer; register is reused below
+       &mov    ($rounds,&wparam(5));           # cmac pointer; register is reused below
+       &mov    ($key_,"esp");                  # caller's %esp
+       &sub    ("esp",60);                     # scratch frame: mask, increment, saved %esp
+       &and    ("esp",-16);                    # align stack
+       &mov    (&DWP(48,"esp"),$key_);         # 48(%esp) = caller's %esp
+
+       &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
+       &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
+       &mov    ($rounds,&DWP(240,$key));       # round count is read from 240($key)
+
+       # compose byte-swap control mask for pshufb on stack
+       &mov    (&DWP(0,"esp"),0x0c0d0e0f);     # 0(%esp): selector bytes 15..0, i.e. full 16-byte reversal
+       &mov    (&DWP(4,"esp"),0x08090a0b);
+       &mov    (&DWP(8,"esp"),0x04050607);
+       &mov    (&DWP(12,"esp"),0x00010203);
+
+       # compose counter increment vector on stack
+       &mov    ($rounds_,1);
+       &xor    ($key_,$key_);
+       &mov    (&DWP(16,"esp"),$rounds_);      # 16(%esp): dwords 1,0,0,0 = 64-bit 1 in the low qword
+       &mov    (&DWP(20,"esp"),$key_);
+       &mov    (&DWP(24,"esp"),$key_);
+       &mov    (&DWP(28,"esp"),$key_);
+
+       &movdqa ($inout3,&QWP(0,"esp"));        # bswap mask
+       &movdqa ($inout0,$ivec);                # first counter block, original byte order
+
+       &mov    ($key_,$key);                   # backup $key
+       &mov    ($rounds_,$rounds);             # backup $rounds (not halved here)
+
+       &pshufb ($ivec,$inout3);                # $ivec is kept byte-reversed so paddq can bump it
+       if ($inline)                            # encrypt the first counter block held in $inout0
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+       &movups ($in0,&QWP(0,$inp));            # load inp
+       &paddq  ($ivec,&QWP(16,"esp"));         # counter += 1 (on the byte-reversed copy)
+       &lea    ($inp,&QWP(16,$inp));
+       &jmp    (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+       &xorps  ($in0,$inout0);                 # inp ^= E(ivec)
+       &movdqa ($inout0,$ivec);                # next counter, still byte-reversed
+       &mov    ($rounds,$rounds_);             # restore full round count
+       &movups (&QWP(0,$out),$in0);            # save output
+       &lea    ($out,&DWP(16,$out));
+       &pshufb ($inout0,$inout3);              # back to original byte order
+
+       &sub    ($len,1);
+       &jz     (&label("ccm64_dec_break"));    # last block: only the cmac update remains
+
+       &$movekey       ($rndkey0,&QWP(0,$key_));       # round key 0
+       &shr            ($rounds,1);                    # inner loop performs two rounds per pass
+       &$movekey       ($rndkey1,&QWP(16,$key_));      # round key 1
+       &xorps          ($in0,$rndkey0);                # output ^ round key 0
+       &lea            ($key,&DWP(32,$key_));
+       &xorps          ($inout0,$rndkey0);             # counter block ^= round key 0
+       &xorps          ($cmac,$in0);           # cmac^=out
+       &$movekey       ($rndkey0,&QWP(0,$key));        # round key 2
+
+&set_label("ccm64_dec2_loop");                         # two rounds per pass on counter block and cmac
+       &aesenc         ($inout0,$rndkey1);
+       &dec            ($rounds);                      # sets ZF for the jnz at the loop bottom
+       &aesenc         ($cmac,$rndkey1);
+       &$movekey       ($rndkey1,&QWP(16,$key));
+       &aesenc         ($inout0,$rndkey0);
+       &lea            ($key,&DWP(32,$key));           # advance by two round keys
+       &aesenc         ($cmac,$rndkey0);
+       &$movekey       ($rndkey0,&QWP(0,$key));
+       &jnz            (&label("ccm64_dec2_loop"));
+       &movups         ($in0,&QWP(0,$inp));    # load next inp block
+       &paddq          ($ivec,&QWP(16,"esp")); # counter += 1
+       &aesenc         ($inout0,$rndkey1);
+       &aesenc         ($cmac,$rndkey1);
+       &lea            ($inp,&QWP(16,$inp));
+       &aesenclast     ($inout0,$rndkey0);     # $inout0 = E(counter)
+       &aesenclast     ($cmac,$rndkey0);       # $cmac = E(cmac^out)
+       &jmp    (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+       &mov    ($key,$key_);                   # restore $key
+       if ($inline)                            # presumably $cmac=E($cmac^$in0); helper is defined outside this chunk
+       {   &aesni_inline_generate1("enc",$cmac,$in0);  }
+       else                                    # NOTE(review): the plain call above works on $inout0; confirm this path really updates $cmac
+       {   &call       ("_aesni_encrypt1",$cmac);      }
+
+       &mov    ("esp",&DWP(48,"esp"));         # restore caller's %esp
+       &mov    ($out,&wparam(5));              # cmac pointer
+       &movups (&QWP(0,$out),$cmac);           # write back updated cmac
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+#      0       pshufb mask
+#      16      vector addend: 0,6,6,6
+#      32      counter-less ivec
+#      48      1st triplet of counter vector
+#      64      2nd triplet of counter vector
+#      80      saved %esp
+
+&function_begin("aesni_ctr32_encrypt_blocks"); # CTR mode with a 32-bit counter: six blocks per pass plus a 1..5 block tail
+       &mov    ($inp,&wparam(0));              # in
+       &mov    ($out,&wparam(1));              # out
+       &mov    ($len,&wparam(2));              # blocks (count of 16-byte blocks)
+       &mov    ($key,&wparam(3));              # key schedule
+       &mov    ($rounds_,&wparam(4));          # ivec pointer; register is reused below
+       &mov    ($key_,"esp");                  # caller's %esp
+       &sub    ("esp",88);
+       &and    ("esp",-16);                    # align stack
+       &mov    (&DWP(80,"esp"),$key_);         # 80(%esp) = caller's %esp
+
+       &cmp    ($len,1);
+       &je     (&label("ctr32_one_shortcut")); # single block: skip all counter-vector setup
+       # NOTE(review): blocks==0 is not special-cased; it reaches ctr32_one below and processes one block
+       &movdqu ($inout5,&QWP(0,$rounds_));     # load ivec
+
+       # compose byte-swap control mask for pshufb on stack
+       &mov    (&DWP(0,"esp"),0x0c0d0e0f);     # 0(%esp): selector bytes 15..0, i.e. full 16-byte reversal
+       &mov    (&DWP(4,"esp"),0x08090a0b);
+       &mov    (&DWP(8,"esp"),0x04050607);
+       &mov    (&DWP(12,"esp"),0x00010203);
+
+       # compose counter increment vector on stack
+       &mov    ($rounds,6);
+       &xor    ($key_,$key_);
+       &mov    (&DWP(16,"esp"),$rounds);       # 16(%esp): dwords 6,6,6,0
+       &mov    (&DWP(20,"esp"),$rounds);
+       &mov    (&DWP(24,"esp"),$rounds);
+       &mov    (&DWP(28,"esp"),$key_);
+
+       &pextrd ($rounds_,$inout5,3);           # pull 32-bit counter
+       &pinsrd ($inout5,$key_,3);              # wipe 32-bit counter
+
+       &mov    ($rounds,&DWP(240,$key));       # key->rounds
+
+       # compose 2 vectors of 3x32-bit counters
+       &bswap  ($rounds_);                     # counter as a native integer n
+       &pxor   ($rndkey1,$rndkey1);
+       &pxor   ($rndkey0,$rndkey0);
+       &movdqa ($inout0,&QWP(0,"esp"));        # load byte-swap mask
+       &pinsrd ($rndkey1,$rounds_,0);          # 1st triplet: n, n+1, n+2
+       &lea    ($key_,&DWP(3,$rounds_));       # n+3
+       &pinsrd ($rndkey0,$key_,0);             # 2nd triplet: n+3, n+4, n+5
+       &inc    ($rounds_);
+       &pinsrd ($rndkey1,$rounds_,1);
+       &inc    ($key_);
+       &pinsrd ($rndkey0,$key_,1);
+       &inc    ($rounds_);
+       &pinsrd ($rndkey1,$rounds_,2);
+       &inc    ($key_);
+       &pinsrd ($rndkey0,$key_,2);
+       &movdqa (&QWP(48,"esp"),$rndkey1);      # save 1st triplet
+       &pshufb ($rndkey1,$inout0);             # byte swap
+       &movdqa (&QWP(64,"esp"),$rndkey0);      # save 2nd triplet
+       &pshufb ($rndkey0,$inout0);             # byte swap
+
+       &pshufd ($inout0,$rndkey1,3<<6);        # place counter to upper dword
+       &pshufd ($inout1,$rndkey1,2<<6);        # same for counter n+1
+       &cmp    ($len,6);
+       &jb     (&label("ctr32_tail"));         # fewer than 6 blocks: $inout5 still holds counter-less ivec
+       &movdqa (&QWP(32,"esp"),$inout5);       # save counter-less ivec
+       &shr    ($rounds,1);                    # halved round count for the six-block path
+       &mov    ($key_,$key);                   # backup $key
+       &mov    ($rounds_,$rounds);             # backup $rounds
+       &sub    ($len,6);                       # $len is kept biased by -6 inside the loop
+       &jmp    (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+       &pshufd ($inout2,$rndkey1,1<<6);        # counter n+2 to upper dword
+       &movdqa ($rndkey1,&QWP(32,"esp"));      # pull counter-less ivec
+       &pshufd ($inout3,$rndkey0,3<<6);        # counters n+3..n+5 from the 2nd triplet
+       &por    ($inout0,$rndkey1);             # merge counter-less ivec
+       &pshufd ($inout4,$rndkey0,2<<6);
+       &por    ($inout1,$rndkey1);
+       &pshufd ($inout5,$rndkey0,1<<6);
+       &por    ($inout2,$rndkey1);
+       &por    ($inout3,$rndkey1);
+       &por    ($inout4,$rndkey1);
+       &por    ($inout5,$rndkey1);
+
+       # inlining _aesni_encrypt6's prologue gives ~4% improvement...
+       &$movekey       ($rndkey0,&QWP(0,$key_));       # round key 0
+       &$movekey       ($rndkey1,&QWP(16,$key_));      # round key 1
+       &lea            ($key,&DWP(32,$key_));
+       &dec            ($rounds);
+       &pxor           ($inout0,$rndkey0);
+       &pxor           ($inout1,$rndkey0);
+       &aesenc         ($inout0,$rndkey1);
+       &pxor           ($inout2,$rndkey0);
+       &aesenc         ($inout1,$rndkey1);
+       &pxor           ($inout3,$rndkey0);
+       &aesenc         ($inout2,$rndkey1);
+       &pxor           ($inout4,$rndkey0);
+       &aesenc         ($inout3,$rndkey1);
+       &pxor           ($inout5,$rndkey0);
+       &aesenc         ($inout4,$rndkey1);
+       &$movekey       ($rndkey0,&QWP(0,$key));        # round key 2
+       &aesenc         ($inout5,$rndkey1);
+
+       &call           (&label("_aesni_encrypt6_enter"));      # remaining rounds; label is defined outside this chunk
+
+       &movups ($rndkey1,&QWP(0,$inp));        # xor keystream into input blocks 0..2
+       &movups ($rndkey0,&QWP(0x10,$inp));
+       &xorps  ($inout0,$rndkey1);
+       &movups ($rndkey1,&QWP(0x20,$inp));
+       &xorps  ($inout1,$rndkey0);
+       &movups (&QWP(0,$out),$inout0);
+       &movdqa ($rndkey0,&QWP(16,"esp"));      # load increment
+       &xorps  ($inout2,$rndkey1);
+       &movdqa ($rndkey1,&QWP(48,"esp"));      # load 1st triplet
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+
+       &paddd  ($rndkey1,$rndkey0);            # 1st triplet increment
+       &paddd  ($rndkey0,&QWP(64,"esp"));      # 2nd triplet increment
+       &movdqa ($inout0,&QWP(0,"esp"));        # load byte swap mask
+
+       &movups ($inout1,&QWP(0x30,$inp));      # xor keystream into input blocks 3..5
+       &movups ($inout2,&QWP(0x40,$inp));
+       &xorps  ($inout3,$inout1);
+       &movups ($inout1,&QWP(0x50,$inp));
+       &lea    ($inp,&DWP(0x60,$inp));
+       &movdqa (&QWP(48,"esp"),$rndkey1);      # save 1st triplet
+       &pshufb ($rndkey1,$inout0);             # byte swap
+       &xorps  ($inout4,$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &xorps  ($inout5,$inout1);
+       &movdqa (&QWP(64,"esp"),$rndkey0);      # save 2nd triplet
+       &pshufb ($rndkey0,$inout0);             # byte swap
+       &movups (&QWP(0x40,$out),$inout4);
+       &pshufd ($inout0,$rndkey1,3<<6);        # next counter to upper dword
+       &movups (&QWP(0x50,$out),$inout5);
+       &lea    ($out,&DWP(0x60,$out));
+
+       &mov    ($rounds,$rounds_);             # restore halved $rounds
+       &pshufd ($inout1,$rndkey1,2<<6);
+       &sub    ($len,6);
+       &jnc    (&label("ctr32_loop6"));        # no borrow: another six blocks remain
+
+       &add    ($len,6);                       # undo bias: 0..5 blocks left
+       &jz     (&label("ctr32_ret"));
+       &mov    ($key,$key_);                   # restore $key
+       &lea    ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+       &movdqa ($inout5,&QWP(32,"esp"));       # pull count-less ivec
+
+&set_label("ctr32_tail");                      # 1..5 blocks; each je below reuses flags of the preceding cmp
+       &por    ($inout0,$inout5);
+       &cmp    ($len,2);
+       &jb     (&label("ctr32_one"));
+
+       &pshufd ($inout2,$rndkey1,1<<6);
+       &por    ($inout1,$inout5);
+       &je     (&label("ctr32_two"));
+
+       &pshufd ($inout3,$rndkey0,3<<6);
+       &por    ($inout2,$inout5);
+       &cmp    ($len,4);
+       &jb     (&label("ctr32_three"));
+
+       &pshufd ($inout4,$rndkey0,2<<6);
+       &por    ($inout3,$inout5);
+       &je     (&label("ctr32_four"));
+
+       &por    ($inout4,$inout5);              # five blocks left
+       &call   ("_aesni_encrypt6");            # sixth lane ($inout5) result is not stored
+       &movups ($rndkey1,&QWP(0,$inp));
+       &movups ($rndkey0,&QWP(0x10,$inp));
+       &xorps  ($inout0,$rndkey1);
+       &movups ($rndkey1,&QWP(0x20,$inp));
+       &xorps  ($inout1,$rndkey0);
+       &movups ($rndkey0,&QWP(0x30,$inp));
+       &xorps  ($inout2,$rndkey1);
+       &movups ($rndkey1,&QWP(0x40,$inp));
+       &xorps  ($inout3,$rndkey0);
+       &movups (&QWP(0,$out),$inout0);
+       &xorps  ($inout4,$rndkey1);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &movups (&QWP(0x40,$out),$inout4);
+       &jmp    (&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+       &movups ($inout0,&QWP(0,$rounds_));     # load ivec
+       &mov    ($rounds,&DWP(240,$key));       # key->rounds
+       
+&set_label("ctr32_one");
+       if ($inline)                            # encrypt the counter block in $inout0
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+       &movups ($in0,&QWP(0,$inp));
+       &xorps  ($in0,$inout0);                 # inp ^= E(counter)
+       &movups (&QWP(0,$out),$in0);
+       &jmp    (&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+       &call   ("_aesni_encrypt3");            # third lane ($inout2) result is not stored
+       &movups ($inout3,&QWP(0,$inp));
+       &movups ($inout4,&QWP(0x10,$inp));
+       &xorps  ($inout0,$inout3);
+       &xorps  ($inout1,$inout4);
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &jmp    (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+       &call   ("_aesni_encrypt3");
+       &movups ($inout3,&QWP(0,$inp));
+       &movups ($inout4,&QWP(0x10,$inp));
+       &xorps  ($inout0,$inout3);
+       &movups ($inout5,&QWP(0x20,$inp));
+       &xorps  ($inout1,$inout4);
+       &movups (&QWP(0,$out),$inout0);
+       &xorps  ($inout2,$inout5);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &jmp    (&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+       &call   ("_aesni_encrypt4");
+       &movups ($inout4,&QWP(0,$inp));
+       &movups ($inout5,&QWP(0x10,$inp));
+       &movups ($rndkey1,&QWP(0x20,$inp));
+       &xorps  ($inout0,$inout4);
+       &movups ($rndkey0,&QWP(0x30,$inp));
+       &xorps  ($inout1,$inout5);
+       &movups (&QWP(0,$out),$inout0);
+       &xorps  ($inout2,$rndkey1);
+       &movups (&QWP(0x10,$out),$inout1);
+       &xorps  ($inout3,$rndkey0);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+       &mov    ("esp",&DWP(80,"esp"));         # restore caller's %esp
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#      const AES_KEY *key1, const AES_KEY *key2,
+#      const unsigned char iv[16]);
+#
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");  # XTS encryption: six blocks per pass, a 1..5 block tail, then ciphertext stealing for len%16
+       &mov    ($key,&wparam(4));              # key2
+       &mov    ($inp,&wparam(5));              # clear-text tweak
+
+       &mov    ($rounds,&DWP(240,$key));       # key2->rounds
+       &movups ($inout0,&QWP(0,$inp));
+       if ($inline)                            # encrypt the clear-text tweak under key2
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+
+       &mov    ($inp,&wparam(0));              # inp
+       &mov    ($out,&wparam(1));              # out
+       &mov    ($len,&wparam(2));              # len, in bytes
+       &mov    ($key,&wparam(3));              # key1
+
+       &mov    ($key_,"esp");                  # caller's %esp
+       &sub    ("esp",16*7+8);                 # seven 16-byte slots plus room for $len and saved %esp
+       &mov    ($rounds,&DWP(240,$key));       # key1->rounds
+       &and    ("esp",-16);                    # align stack
+
+       &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
+       &mov    (&DWP(16*6+4,"esp"),0);
+       &mov    (&DWP(16*6+8,"esp"),1);
+       &mov    (&DWP(16*6+12,"esp"),0);
+       &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
+       &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
+
+       &movdqa ($tweak,$inout0);               # initial tweak = encrypted iv
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+
+       &and    ($len,-16);                     # whole blocks only; the remainder is handled at the end
+       &mov    ($key_,$key);                   # backup $key
+       &mov    ($rounds_,$rounds);             # backup $rounds
+       &sub    ($len,16*6);                    # $len is kept biased by -96 inside the loop
+       &jc     (&label("xts_enc_short"));      # fewer than six whole blocks
+
+       &shr    ($rounds,1);                    # halved round count for the six-block path
+       &mov    ($rounds_,$rounds);
+       &jmp    (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+       for ($i=0;$i<4;$i++) {                  # tweaks 0..3 go to 16*$i(%esp); each step doubles $tweak
+           &pshufd     ($twres,$twtmp,0x13);   # line sign masks up with the 0x87 and carry slots of $twmask
+           &pxor       ($twtmp,$twtmp);
+           &movdqa     (&QWP(16*$i,"esp"),$tweak);
+           &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
+           &pand       ($twres,$twmask);       # isolate carry and residue
+           &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
+           &pxor       ($tweak,$twres);
+       }
+       &pshufd ($inout5,$twtmp,0x13);
+       &movdqa (&QWP(16*$i++,"esp"),$tweak);   # tweak 4 at 16*4(%esp); $i becomes 5
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+        &$movekey      ($rndkey0,&QWP(0,$key_));       # round key 0 (overwrites $twtmp)
+       &pand   ($inout5,$twmask);              # isolate carry and residue
+        &movups        ($inout0,&QWP(0,$inp)); # load input
+       &pxor   ($inout5,$tweak);               # $inout5 = tweak 5
+
+       # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+       &movdqu ($inout1,&QWP(16*1,$inp));      # overwrites $twmask; it is reloaded after the call
+        &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
+       &movdqu ($inout2,&QWP(16*2,$inp));
+        &pxor          ($inout1,$rndkey0);
+       &movdqu ($inout3,&QWP(16*3,$inp));
+        &pxor          ($inout2,$rndkey0);
+       &movdqu ($inout4,&QWP(16*4,$inp));
+        &pxor          ($inout3,$rndkey0);
+       &movdqu ($rndkey1,&QWP(16*5,$inp));     # sixth input block, parked in $rndkey1
+        &pxor          ($inout4,$rndkey0);
+       &lea    ($inp,&DWP(16*6,$inp));
+       &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
+       &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
+       &pxor   ($inout5,$rndkey1);             # tweak 5 ^ sixth input block
+
+        &$movekey      ($rndkey1,&QWP(16,$key_));      # round key 1
+        &lea           ($key,&DWP(32,$key_));
+       &pxor   ($inout1,&QWP(16*1,"esp"));
+        &aesenc        ($inout0,$rndkey1);
+       &pxor   ($inout2,&QWP(16*2,"esp"));
+        &aesenc        ($inout1,$rndkey1);
+       &pxor   ($inout3,&QWP(16*3,"esp"));
+        &dec           ($rounds);
+        &aesenc        ($inout2,$rndkey1);
+       &pxor   ($inout4,&QWP(16*4,"esp"));
+        &aesenc        ($inout3,$rndkey1);
+       &pxor           ($inout5,$rndkey0);
+        &aesenc        ($inout4,$rndkey1);
+        &$movekey      ($rndkey0,&QWP(0,$key));        # round key 2
+        &aesenc        ($inout5,$rndkey1);
+       &call           (&label("_aesni_encrypt6_enter"));      # remaining rounds; label is defined outside this chunk
+
+       &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
+       &pxor   ($twtmp,$twtmp);
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
+       &pcmpgtd        ($twtmp,$tweak);                # broadcast upper bits
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &xorps  ($inout2,&QWP(16*2,"esp"));
+       &movups (&QWP(16*1,$out),$inout1);
+       &xorps  ($inout3,&QWP(16*3,"esp"));
+       &movups (&QWP(16*2,$out),$inout2);
+       &xorps  ($inout4,&QWP(16*4,"esp"));
+       &movups (&QWP(16*3,$out),$inout3);
+       &xorps  ($inout5,$tweak);
+       &movups (&QWP(16*4,$out),$inout4);
+       &pshufd ($twres,$twtmp,0x13);
+       &movups (&QWP(16*5,$out),$inout5);
+       &lea    ($out,&DWP(16*6,$out));
+       &movdqa ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
+
+       &pxor   ($twtmp,$twtmp);                # advance to the tweak for the next block
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &mov    ($rounds,$rounds_);             # restore $rounds
+       &pxor   ($tweak,$twres);
+
+       &sub    ($len,16*6);
+       &jnc    (&label("xts_enc_loop6"));      # no borrow: another six blocks remain
+
+       &lea    ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+       &mov    ($key,$key_);                   # restore $key
+       &mov    ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+       &add    ($len,16*6);                    # undo bias: 0..5 whole blocks left, in bytes
+       &jz     (&label("xts_enc_done6x"));
+
+       &movdqa ($inout3,$tweak);               # put aside previous tweak
+       &cmp    ($len,0x20);
+       &jb     (&label("xts_enc_one"));
+
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+       &je     (&label("xts_enc_two"));        # flags still from cmp($len,0x20)
+
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($inout4,$tweak);               # put aside previous tweak
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+       &cmp    ($len,0x40);
+       &jb     (&label("xts_enc_three"));
+
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($inout5,$tweak);               # put aside previous tweak
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+       &movdqa (&QWP(16*0,"esp"),$inout3);     # tweaks 0 and 1 to the stack
+       &movdqa (&QWP(16*1,"esp"),$inout4);
+       &je     (&label("xts_enc_four"));       # flags still from cmp($len,0x40)
+
+       &movdqa (&QWP(16*2,"esp"),$inout5);     # five blocks left
+       &pshufd ($inout5,$twtmp,0x13);
+       &movdqa (&QWP(16*3,"esp"),$tweak);
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($inout5,$twmask);              # isolate carry and residue
+       &pxor   ($inout5,$tweak);               # $inout5 = tweak for the fifth block
+
+       &movdqu ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu ($inout1,&QWP(16*1,$inp));
+       &movdqu ($inout2,&QWP(16*2,$inp));
+       &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
+       &movdqu ($inout3,&QWP(16*3,$inp));
+       &pxor   ($inout1,&QWP(16*1,"esp"));
+       &movdqu ($inout4,&QWP(16*4,$inp));
+       &pxor   ($inout2,&QWP(16*2,"esp"));
+       &lea    ($inp,&DWP(16*5,$inp));
+       &pxor   ($inout3,&QWP(16*3,"esp"));
+       &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
+       &pxor   ($inout4,$inout5);
+
+       &call   ("_aesni_encrypt6");            # sixth lane ($inout5) result is not stored
+
+       &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &xorps  ($inout2,&QWP(16*2,"esp"));
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &xorps  ($inout3,&QWP(16*3,"esp"));
+       &movups (&QWP(16*1,$out),$inout1);
+       &xorps  ($inout4,$tweak);
+       &movups (&QWP(16*2,$out),$inout2);
+       &movups (&QWP(16*3,$out),$inout3);
+       &movups (&QWP(16*4,$out),$inout4);
+       &lea    ($out,&DWP(16*5,$out));
+       &jmp    (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &lea    ($inp,&DWP(16*1,$inp));
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &lea    ($out,&DWP(16*1,$out));
+
+       &movdqa ($tweak,$inout3);               # last tweak
+       &jmp    (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+       &movaps ($inout4,$tweak);               # put aside last tweak
+
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &movups ($inout1,&QWP(16*1,$inp));
+       &lea    ($inp,&DWP(16*2,$inp));
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       &xorps  ($inout1,$inout4);
+       &xorps  ($inout2,$inout2);              # zero the unused third lane
+
+       &call   ("_aesni_encrypt3");
+
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &xorps  ($inout1,$inout4);
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &movups (&QWP(16*1,$out),$inout1);
+       &lea    ($out,&DWP(16*2,$out));
+
+       &movdqa ($tweak,$inout4);               # last tweak
+       &jmp    (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+       &movaps ($inout5,$tweak);               # put aside last tweak
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &movups ($inout1,&QWP(16*1,$inp));
+       &movups ($inout2,&QWP(16*2,$inp));
+       &lea    ($inp,&DWP(16*3,$inp));
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       &xorps  ($inout1,$inout4);
+       &xorps  ($inout2,$inout5);
+
+       &call   ("_aesni_encrypt3");
+
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &xorps  ($inout1,$inout4);
+       &xorps  ($inout2,$inout5);
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &movups (&QWP(16*1,$out),$inout1);
+       &movups (&QWP(16*2,$out),$inout2);
+       &lea    ($out,&DWP(16*3,$out));
+
+       &movdqa ($tweak,$inout5);               # last tweak
+       &jmp    (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);                 # tweaks 0 and 1 are on the stack, tweak 2 is in $inout5
+       &movaps ($inout4,$tweak);               # put aside last tweak
+
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &movups ($inout1,&QWP(16*1,$inp));
+       &movups ($inout2,&QWP(16*2,$inp));
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
+       &movups ($inout3,&QWP(16*3,$inp));
+       &lea    ($inp,&DWP(16*4,$inp));
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &xorps  ($inout2,$inout5);
+       &xorps  ($inout3,$inout4);
+
+       &call   ("_aesni_encrypt4");
+
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &xorps  ($inout2,$inout5);
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &xorps  ($inout3,$inout4);
+       &movups (&QWP(16*1,$out),$inout1);
+       &movups (&QWP(16*2,$out),$inout2);
+       &movups (&QWP(16*3,$out),$inout3);
+       &lea    ($out,&DWP(16*4,$out));
+
+       &movdqa ($tweak,$inout4);               # last tweak
+       &jmp    (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16);               # $tweak is pre-calculated
+       &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
+       &and    ($len,15);                      # trailing partial-block bytes
+       &jz     (&label("xts_enc_ret"));
+       &movdqa ($inout3,$tweak);               # tweak for the stolen block
+       &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
+       &jmp    (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);                 # $tweak holds the last tweak used
+       &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
+       &pxor   ($twtmp,$twtmp);
+       &and    ($len,15);                      # trailing partial-block bytes
+       &jz     (&label("xts_enc_ret"));
+
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
+       &pshufd ($inout3,$twtmp,0x13);
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($inout3,&QWP(16*6,"esp"));     # isolate carry and residue
+       &pxor   ($inout3,$tweak);               # $inout3 = next tweak, for the stolen block
+
+&set_label("xts_enc_steal");                   # swap $len tail bytes with the start of the last output block
+       &movz   ($rounds,&BP(0,$inp));          # next tail input byte ($rounds used as scratch)
+       &movz   ($key,&BP(-16,$out));           # matching byte of the last output block ($key used as scratch)
+       &lea    ($inp,&DWP(1,$inp));
+       &mov    (&BP(-16,$out),&LB($rounds));   # tail input byte goes into the last full block
+       &mov    (&BP(0,$out),&LB($key));        # displaced output byte becomes the short final block
+       &lea    ($out,&DWP(1,$out));
+       &sub    ($len,1);
+       &jnz    (&label("xts_enc_steal"));
+
+       &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
+       &mov    ($key,$key_);                   # restore $key
+       &mov    ($rounds,$rounds_);             # restore $rounds
+
+       &movups ($inout0,&QWP(-16,$out));       # load input
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &movups (&QWP(-16,$out),$inout0);       # write output
+
+&set_label("xts_enc_ret");
+       &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+       &mov    ($key,&wparam(4));              # key2
+       &mov    ($inp,&wparam(5));              # clear-text tweak
+
+       &mov    ($rounds,&DWP(240,$key));       # key2->rounds
+       &movups ($inout0,&QWP(0,$inp));
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+
+       &mov    ($inp,&wparam(0));
+       &mov    ($out,&wparam(1));
+       &mov    ($len,&wparam(2));
+       &mov    ($key,&wparam(3));              # key1
+
+       &mov    ($key_,"esp");
+       &sub    ("esp",16*7+8);
+       &and    ("esp",-16);                    # align stack
+
+       &xor    ($rounds_,$rounds_);            # if(len%16) len-=16;
+       &test   ($len,15);
+       &setnz  (&LB($rounds_));
+       &shl    ($rounds_,4);
+       &sub    ($len,$rounds_);
+
+       &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
+       &mov    (&DWP(16*6+4,"esp"),0);
+       &mov    (&DWP(16*6+8,"esp"),1);
+       &mov    (&DWP(16*6+12,"esp"),0);
+       &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
+       &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
+
+       &mov    ($rounds,&DWP(240,$key));       # key1->rounds
+       &mov    ($key_,$key);                   # backup $key
+       &mov    ($rounds_,$rounds);             # backup $rounds
+
+       &movdqa ($tweak,$inout0);
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+
+       &and    ($len,-16);
+       &sub    ($len,16*6);
+       &jc     (&label("xts_dec_short"));
+
+       &shr    ($rounds,1);
+       &mov    ($rounds_,$rounds);
+       &jmp    (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+       for ($i=0;$i<4;$i++) {
+           &pshufd     ($twres,$twtmp,0x13);
+           &pxor       ($twtmp,$twtmp);
+           &movdqa     (&QWP(16*$i,"esp"),$tweak);
+           &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
+           &pand       ($twres,$twmask);       # isolate carry and residue
+           &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
+           &pxor       ($tweak,$twres);
+       }
+       &pshufd ($inout5,$twtmp,0x13);
+       &movdqa (&QWP(16*$i++,"esp"),$tweak);
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+        &$movekey      ($rndkey0,&QWP(0,$key_));
+       &pand   ($inout5,$twmask);              # isolate carry and residue
+        &movups        ($inout0,&QWP(0,$inp)); # load input
+       &pxor   ($inout5,$tweak);
+
+       # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+       &movdqu ($inout1,&QWP(16*1,$inp));
+        &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
+       &movdqu ($inout2,&QWP(16*2,$inp));
+        &pxor          ($inout1,$rndkey0);
+       &movdqu ($inout3,&QWP(16*3,$inp));
+        &pxor          ($inout2,$rndkey0);
+       &movdqu ($inout4,&QWP(16*4,$inp));
+        &pxor          ($inout3,$rndkey0);
+       &movdqu ($rndkey1,&QWP(16*5,$inp));
+        &pxor          ($inout4,$rndkey0);
+       &lea    ($inp,&DWP(16*6,$inp));
+       &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
+       &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
+       &pxor   ($inout5,$rndkey1);
+
+        &$movekey      ($rndkey1,&QWP(16,$key_));
+        &lea           ($key,&DWP(32,$key_));
+       &pxor   ($inout1,&QWP(16*1,"esp"));
+        &aesdec        ($inout0,$rndkey1);
+       &pxor   ($inout2,&QWP(16*2,"esp"));
+        &aesdec        ($inout1,$rndkey1);
+       &pxor   ($inout3,&QWP(16*3,"esp"));
+        &dec           ($rounds);
+        &aesdec        ($inout2,$rndkey1);
+       &pxor   ($inout4,&QWP(16*4,"esp"));
+        &aesdec        ($inout3,$rndkey1);
+       &pxor           ($inout5,$rndkey0);
+        &aesdec        ($inout4,$rndkey1);
+        &$movekey      ($rndkey0,&QWP(0,$key));
+        &aesdec        ($inout5,$rndkey1);
+       &call           (&label("_aesni_decrypt6_enter"));
+
+       &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
+       &pxor   ($twtmp,$twtmp);
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
+       &pcmpgtd        ($twtmp,$tweak);                # broadcast upper bits
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &xorps  ($inout2,&QWP(16*2,"esp"));
+       &movups (&QWP(16*1,$out),$inout1);
+       &xorps  ($inout3,&QWP(16*3,"esp"));
+       &movups (&QWP(16*2,$out),$inout2);
+       &xorps  ($inout4,&QWP(16*4,"esp"));
+       &movups (&QWP(16*3,$out),$inout3);
+       &xorps  ($inout5,$tweak);
+       &movups (&QWP(16*4,$out),$inout4);
+       &pshufd ($twres,$twtmp,0x13);
+       &movups (&QWP(16*5,$out),$inout5);
+       &lea    ($out,&DWP(16*6,$out));
+       &movdqa ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
+
+       &pxor   ($twtmp,$twtmp);
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &mov    ($rounds,$rounds_);             # restore $rounds
+       &pxor   ($tweak,$twres);
+
+       &sub    ($len,16*6);
+       &jnc    (&label("xts_dec_loop6"));
+
+       &lea    ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+       &mov    ($key,$key_);                   # restore $key
+       &mov    ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+       &add    ($len,16*6);
+       &jz     (&label("xts_dec_done6x"));
+
+       &movdqa ($inout3,$tweak);               # put aside previous tweak
+       &cmp    ($len,0x20);
+       &jb     (&label("xts_dec_one"));
+
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+       &je     (&label("xts_dec_two"));
+
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($inout4,$tweak);               # put aside previous tweak
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+       &cmp    ($len,0x40);
+       &jb     (&label("xts_dec_three"));
+
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($inout5,$tweak);               # put aside previous tweak
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+       &movdqa (&QWP(16*0,"esp"),$inout3);
+       &movdqa (&QWP(16*1,"esp"),$inout4);
+       &je     (&label("xts_dec_four"));
+
+       &movdqa (&QWP(16*2,"esp"),$inout5);
+       &pshufd ($inout5,$twtmp,0x13);
+       &movdqa (&QWP(16*3,"esp"),$tweak);
+       &paddq  ($tweak,$tweak);                # &psllq($inout0,1);
+       &pand   ($inout5,$twmask);              # isolate carry and residue
+       &pxor   ($inout5,$tweak);
+
+       &movdqu ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu ($inout1,&QWP(16*1,$inp));
+       &movdqu ($inout2,&QWP(16*2,$inp));
+       &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
+       &movdqu ($inout3,&QWP(16*3,$inp));
+       &pxor   ($inout1,&QWP(16*1,"esp"));
+       &movdqu ($inout4,&QWP(16*4,$inp));
+       &pxor   ($inout2,&QWP(16*2,"esp"));
+       &lea    ($inp,&DWP(16*5,$inp));
+       &pxor   ($inout3,&QWP(16*3,"esp"));
+       &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
+       &pxor   ($inout4,$inout5);
+
+       &call   ("_aesni_decrypt6");
+
+       &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &xorps  ($inout2,&QWP(16*2,"esp"));
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &xorps  ($inout3,&QWP(16*3,"esp"));
+       &movups (&QWP(16*1,$out),$inout1);
+       &xorps  ($inout4,$tweak);
+       &movups (&QWP(16*2,$out),$inout2);
+       &movups (&QWP(16*3,$out),$inout3);
+       &movups (&QWP(16*4,$out),$inout4);
+       &lea    ($out,&DWP(16*5,$out));
+       &jmp    (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &lea    ($inp,&DWP(16*1,$inp));
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &lea    ($out,&DWP(16*1,$out));
+
+       &movdqa ($tweak,$inout3);               # last tweak
+       &jmp    (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+       &movaps ($inout4,$tweak);               # put aside last tweak
+
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &movups ($inout1,&QWP(16*1,$inp));
+       &lea    ($inp,&DWP(16*2,$inp));
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       &xorps  ($inout1,$inout4);
+
+       &call   ("_aesni_decrypt3");
+
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &xorps  ($inout1,$inout4);
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &movups (&QWP(16*1,$out),$inout1);
+       &lea    ($out,&DWP(16*2,$out));
+
+       &movdqa ($tweak,$inout4);               # last tweak
+       &jmp    (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+       &movaps ($inout5,$tweak);               # put aside last tweak
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &movups ($inout1,&QWP(16*1,$inp));
+       &movups ($inout2,&QWP(16*2,$inp));
+       &lea    ($inp,&DWP(16*3,$inp));
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       &xorps  ($inout1,$inout4);
+       &xorps  ($inout2,$inout5);
+
+       &call   ("_aesni_decrypt3");
+
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &xorps  ($inout1,$inout4);
+       &xorps  ($inout2,$inout5);
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &movups (&QWP(16*1,$out),$inout1);
+       &movups (&QWP(16*2,$out),$inout2);
+       &lea    ($out,&DWP(16*3,$out));
+
+       &movdqa ($tweak,$inout5);               # last tweak
+       &jmp    (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+       &movaps ($inout4,$tweak);               # put aside last tweak
+
+       &movups ($inout0,&QWP(16*0,$inp));      # load input
+       &movups ($inout1,&QWP(16*1,$inp));
+       &movups ($inout2,&QWP(16*2,$inp));
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
+       &movups ($inout3,&QWP(16*3,$inp));
+       &lea    ($inp,&DWP(16*4,$inp));
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &xorps  ($inout2,$inout5);
+       &xorps  ($inout3,$inout4);
+
+       &call   ("_aesni_decrypt4");
+
+       &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
+       &xorps  ($inout1,&QWP(16*1,"esp"));
+       &xorps  ($inout2,$inout5);
+       &movups (&QWP(16*0,$out),$inout0);      # write output
+       &xorps  ($inout3,$inout4);
+       &movups (&QWP(16*1,$out),$inout1);
+       &movups (&QWP(16*2,$out),$inout2);
+       &movups (&QWP(16*3,$out),$inout3);
+       &lea    ($out,&DWP(16*4,$out));
+
+       &movdqa ($tweak,$inout4);               # last tweak
+       &jmp    (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16);               # $tweak is pre-calculated
+       &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
+       &and    ($len,15);
+       &jz     (&label("xts_dec_ret"));
+       &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
+       &jmp    (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+       &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
+       &pxor   ($twtmp,$twtmp);
+       &and    ($len,15);
+       &jz     (&label("xts_dec_ret"));
+
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
+       &pshufd ($twres,$twtmp,0x13);
+       &pxor   ($twtmp,$twtmp);
+       &movdqa ($twmask,&QWP(16*6,"esp"));
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($twres,$twmask);               # isolate carry and residue
+       &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
+       &pxor   ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+       &pshufd ($inout3,$twtmp,0x13);
+       &movdqa ($inout4,$tweak);               # put aside previous tweak
+       &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
+       &pand   ($inout3,$twmask);              # isolate carry and residue
+       &pxor   ($inout3,$tweak);
+
+       &mov    ($key,$key_);                   # restore $key
+       &mov    ($rounds,$rounds_);             # restore $rounds
+
+       &movups ($inout0,&QWP(0,$inp));         # load input
+       &xorps  ($inout0,$inout3);              # input^=tweak
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+       &xorps  ($inout0,$inout3);              # output^=tweak
+       &movups (&QWP(0,$out),$inout0);         # write output
+
+&set_label("xts_dec_steal");
+       &movz   ($rounds,&BP(16,$inp));
+       &movz   ($key,&BP(0,$out));
+       &lea    ($inp,&DWP(1,$inp));
+       &mov    (&BP(0,$out),&LB($rounds));
+       &mov    (&BP(16,$out),&LB($key));
+       &lea    ($out,&DWP(1,$out));
+       &sub    ($len,1);
+       &jnz    (&label("xts_dec_steal"));
+
+       &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
+       &mov    ($key,$key_);                   # restore $key
+       &mov    ($rounds,$rounds_);             # restore $rounds
+
+       &movups ($inout0,&QWP(0,$out));         # load input
+       &xorps  ($inout0,$inout4);              # input^=tweak
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+       &xorps  ($inout0,$inout4);              # output^=tweak
+       &movups (&QWP(0,$out),$inout0);         # write output
+
+&set_label("xts_dec_ret");
+       &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+#                           size_t length, const AES_KEY *key,
+#                           unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+       &mov    ($inp,&wparam(0));              # inp
+       &mov    ($rounds_,"esp");               # candidate value for the new %esp
+       &mov    ($out,&wparam(1));              # out
+       &sub    ($rounds_,24);                  # scratch: 16-byte slot + saved %esp
+       &mov    ($len,&wparam(2));              # length
+       &and    ($rounds_,-16);                 # 16-byte align, slot is accessed with movaps
+       &mov    ($key,&wparam(3));              # key
+       &mov    ($key_,&wparam(4));             # ivp
+       &test   ($len,$len);
+       &jz     (&label("cbc_abort"));          # length==0: nothing to do
+
+       &cmp    (&wparam(5),0);                 # enc==0? flags are consumed by &je below
+       &xchg   ($rounds_,"esp");               # alloca
+       &movups ($ivec,&QWP(0,$key_));          # load IV
+       &mov    ($rounds,&DWP(240,$key));       # rounds field at offset 240 of the key
+       &mov    ($key_,$key);                   # backup $key
+       &mov    (&DWP(16,"esp"),$rounds_);      # save original %esp
+       &mov    ($rounds_,$rounds);             # backup $rounds
+       &je     (&label("cbc_decrypt"));
+
+       &movaps ($inout0,$ivec);
+       &cmp    ($len,16);
+       &jb     (&label("cbc_enc_tail"));       # less than one full block
+       &sub    ($len,16);
+       &jmp    (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+       &movups ($ivec,&QWP(0,$inp));           # input actually
+       &lea    ($inp,&DWP(16,$inp));
+       if ($inline)
+       {   &aesni_inline_generate1("enc",$inout0,$ivec);       }
+       else
+       {   &xorps($inout0,$ivec); &call("_aesni_encrypt1");    }
+       &mov    ($rounds,$rounds_);     # restore $rounds
+       &mov    ($key,$key_);           # restore $key
+       &movups (&QWP(0,$out),$inout0); # store output
+       &lea    ($out,&DWP(16,$out));
+       &sub    ($len,16);
+       &jnc    (&label("cbc_enc_loop"));
+       &add    ($len,16);
+       &jnz    (&label("cbc_enc_tail"));
+       &movaps ($ivec,$inout0);
+       &jmp    (&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+       &mov    ("ecx",$len);           # zaps $rounds
+       &data_word(0xA4F3F689);         # rep movsb
+       &mov    ("ecx",16);             # zero tail
+       &sub    ("ecx",$len);
+       &xor    ("eax","eax");          # zaps $len
+       &data_word(0xAAF3F689);         # rep stosb
+       &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
+       &mov    ($rounds,$rounds_);     # restore $rounds
+       &mov    ($inp,$out);            # $inp and $out are the same
+       &mov    ($key,$key_);           # restore $key
+       &jmp    (&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+       &cmp    ($len,0x50);
+       &jbe    (&label("cbc_dec_tail"));       # at most 5 blocks: skip the 6x loop
+       &movaps (&QWP(0,"esp"),$ivec);          # save IV
+       &sub    ($len,0x50);
+       &jmp    (&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+       &movaps (&QWP(0,"esp"),$rndkey0);       # save IV
+       &movups (&QWP(0,$out),$inout5);
+       &lea    ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+       &movdqu ($inout0,&QWP(0,$inp));
+       &movdqu ($inout1,&QWP(0x10,$inp));
+       &movdqu ($inout2,&QWP(0x20,$inp));
+       &movdqu ($inout3,&QWP(0x30,$inp));
+       &movdqu ($inout4,&QWP(0x40,$inp));
+       &movdqu ($inout5,&QWP(0x50,$inp));
+
+       &call   ("_aesni_decrypt6");
+
+       &movups ($rndkey1,&QWP(0,$inp));
+       &movups ($rndkey0,&QWP(0x10,$inp));
+       &xorps  ($inout0,&QWP(0,"esp"));        # ^=IV
+       &xorps  ($inout1,$rndkey1);
+       &movups ($rndkey1,&QWP(0x20,$inp));
+       &xorps  ($inout2,$rndkey0);
+       &movups ($rndkey0,&QWP(0x30,$inp));
+       &xorps  ($inout3,$rndkey1);
+       &movups ($rndkey1,&QWP(0x40,$inp));
+       &xorps  ($inout4,$rndkey0);
+       &movups ($rndkey0,&QWP(0x50,$inp));     # IV
+       &xorps  ($inout5,$rndkey1);
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &lea    ($inp,&DWP(0x60,$inp));
+       &movups (&QWP(0x20,$out),$inout2);
+       &mov    ($rounds,$rounds_);             # restore $rounds
+       &movups (&QWP(0x30,$out),$inout3);
+       &mov    ($key,$key_);                   # restore $key
+       &movups (&QWP(0x40,$out),$inout4);
+       &lea    ($out,&DWP(0x50,$out));         # 6th block is written at loop top or below
+       &sub    ($len,0x60);
+       &ja     (&label("cbc_dec_loop6"));
+
+       &movaps ($inout0,$inout5);
+       &movaps ($ivec,$rndkey0);
+       &add    ($len,0x50);
+       &jle    (&label("cbc_dec_tail_collected"));
+       &movups (&QWP(0,$out),$inout0);
+       &lea    ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+       &movups ($inout0,&QWP(0,$inp));
+       &movaps ($in0,$inout0);                 # keep ciphertext copy for chaining
+       &cmp    ($len,0x10);
+       &jbe    (&label("cbc_dec_one"));
+
+       &movups ($inout1,&QWP(0x10,$inp));
+       &movaps ($in1,$inout1);                 # keep ciphertext copy for chaining
+       &cmp    ($len,0x20);
+       &jbe    (&label("cbc_dec_two"));
+
+       &movups ($inout2,&QWP(0x20,$inp));
+       &cmp    ($len,0x30);
+       &jbe    (&label("cbc_dec_three"));
+
+       &movups ($inout3,&QWP(0x30,$inp));
+       &cmp    ($len,0x40);
+       &jbe    (&label("cbc_dec_four"));
+
+       &movups ($inout4,&QWP(0x40,$inp));
+       &movaps (&QWP(0,"esp"),$ivec);          # save IV
+       &movups ($inout0,&QWP(0,$inp));
+       &xorps  ($inout5,$inout5);              # 6th lane carries no input here
+       &call   ("_aesni_decrypt6");
+       &movups ($rndkey1,&QWP(0,$inp));
+       &movups ($rndkey0,&QWP(0x10,$inp));
+       &xorps  ($inout0,&QWP(0,"esp"));        # ^= IV
+       &xorps  ($inout1,$rndkey1);
+       &movups ($rndkey1,&QWP(0x20,$inp));
+       &xorps  ($inout2,$rndkey0);
+       &movups ($rndkey0,&QWP(0x30,$inp));
+       &xorps  ($inout3,$rndkey1);
+       &movups ($ivec,&QWP(0x40,$inp));        # IV
+       &xorps  ($inout4,$rndkey0);
+       &movups (&QWP(0,$out),$inout0);
+       &movups (&QWP(0x10,$out),$inout1);
+       &movups (&QWP(0x20,$out),$inout2);
+       &movups (&QWP(0x30,$out),$inout3);
+       &lea    ($out,&DWP(0x40,$out));
+       &movaps ($inout0,$inout4);
+       &sub    ($len,0x50);
+       &jmp    (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+       &xorps  ($inout0,$ivec);
+       &movaps ($ivec,$in0);
+       &sub    ($len,0x10);
+       &jmp    (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+       &xorps  ($inout2,$inout2);              # 3rd lane carries no input here
+       &call   ("_aesni_decrypt3");
+       &xorps  ($inout0,$ivec);
+       &xorps  ($inout1,$in0);
+       &movups (&QWP(0,$out),$inout0);
+       &movaps ($inout0,$inout1);
+       &lea    ($out,&DWP(0x10,$out));
+       &movaps ($ivec,$in1);
+       &sub    ($len,0x20);
+       &jmp    (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+       &call   ("_aesni_decrypt3");
+       &xorps  ($inout0,$ivec);
+       &xorps  ($inout1,$in0);
+       &xorps  ($inout2,$in1);
+       &movups (&QWP(0,$out),$inout0);
+       &movaps ($inout0,$inout2);
+       &movups (&QWP(0x10,$out),$inout1);
+       &lea    ($out,&DWP(0x20,$out));
+       &movups ($ivec,&QWP(0x20,$inp));
+       &sub    ($len,0x30);
+       &jmp    (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+       &call   ("_aesni_decrypt4");
+       &movups ($rndkey1,&QWP(0x10,$inp));
+       &movups ($rndkey0,&QWP(0x20,$inp));
+       &xorps  ($inout0,$ivec);
+       &movups ($ivec,&QWP(0x30,$inp));
+       &xorps  ($inout1,$in0);
+       &movups (&QWP(0,$out),$inout0);
+       &xorps  ($inout2,$rndkey1);
+       &movups (&QWP(0x10,$out),$inout1);
+       &xorps  ($inout3,$rndkey0);
+       &movups (&QWP(0x20,$out),$inout2);
+       &lea    ($out,&DWP(0x30,$out));
+       &movaps ($inout0,$inout3);
+       &sub    ($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+       &and    ($len,15);
+       &jnz    (&label("cbc_dec_tail_partial"));
+       &movups (&QWP(0,$out),$inout0);
+       &jmp    (&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+       &movaps (&QWP(0,"esp"),$inout0);        # stage last block in the aligned slot
+       &mov    ("ecx",16);
+       &mov    ($inp,"esp");
+       &sub    ("ecx",$len);                   # ecx = 16 - $len
+       &data_word(0xA4F3F689);         # rep movsb
+
+&set_label("cbc_ret");
+       &mov    ("esp",&DWP(16,"esp")); # pull original %esp
+       &mov    ($key_,&wparam(4));     # ivp
+       &movups (&QWP(0,$key_),$ivec);  # output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is private interface,
+# input:
+#      "eax"   const unsigned char *userKey
+#      $rounds int bits
+#      $key    AES_KEY *key
+# output:
+#      "eax"   return code
+#      $rounds rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+       &test   ("eax","eax");
+       &jz     (&label("bad_pointer"));        # userKey is NULL
+       &test   ($key,$key);
+       &jz     (&label("bad_pointer"));        # key is NULL
+
+       &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+       &xorps  ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
+       &lea    ($key,&DWP(16,$key));   # round 0 is stored at -16($key) below
+       &cmp    ($rounds,256);
+       &je     (&label("14rounds"));
+       &cmp    ($rounds,192);
+       &je     (&label("12rounds"));
+       &cmp    ($rounds,128);
+       &jne    (&label("bad_keybits"));
+
+&set_label("10rounds",16);
+       &mov            ($rounds,9);
+       &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
+       &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
+       &call           (&label("key_128_cold"));
+       &aeskeygenassist("xmm1","xmm0",0x2);            # round 2
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x04);           # round 3
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x08);           # round 4
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x10);           # round 5
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x20);           # round 6
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x40);           # round 7
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x80);           # round 8
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x1b);           # round 9
+       &call           (&label("key_128"));
+       &aeskeygenassist("xmm1","xmm0",0x36);           # round 10
+       &call           (&label("key_128"));
+       &$movekey       (&QWP(0,$key),"xmm0");
+       &mov            (&DWP(80,$key),$rounds);        # lands at offset 240 of the schedule
+       &xor            ("eax","eax");                  # return 0
+       &ret();
+
+&set_label("key_128",16);
+       &$movekey       (&QWP(0,$key),"xmm0");          # store previous round key
+       &lea            ($key,&DWP(16,$key));
+&set_label("key_128_cold");
+       &shufps         ("xmm4","xmm0",0b00010000);
+       &xorps          ("xmm0","xmm4");
+       &shufps         ("xmm4","xmm0",0b10001100);
+       &xorps          ("xmm0","xmm4");
+       &shufps         ("xmm1","xmm1",0b11111111);     # critical path
+       &xorps          ("xmm0","xmm1");
+       &ret();
+
+&set_label("12rounds",16);
+       &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
+       &mov            ($rounds,11);
+       &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
+       &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
+       &call           (&label("key_192a_cold"));
+       &aeskeygenassist("xmm1","xmm2",0x02);           # round 2,3
+       &call           (&label("key_192b"));
+       &aeskeygenassist("xmm1","xmm2",0x04);           # round 4,5
+       &call           (&label("key_192a"));
+       &aeskeygenassist("xmm1","xmm2",0x08);           # round 5,6
+       &call           (&label("key_192b"));
+       &aeskeygenassist("xmm1","xmm2",0x10);           # round 7,8
+       &call           (&label("key_192a"));
+       &aeskeygenassist("xmm1","xmm2",0x20);           # round 8,9
+       &call           (&label("key_192b"));
+       &aeskeygenassist("xmm1","xmm2",0x40);           # round 10,11
+       &call           (&label("key_192a"));
+       &aeskeygenassist("xmm1","xmm2",0x80);           # round 11,12
+       &call           (&label("key_192b"));
+       &$movekey       (&QWP(0,$key),"xmm0");
+       &mov            (&DWP(48,$key),$rounds);        # lands at offset 240 of the schedule
+       &xor            ("eax","eax");                  # return 0
+       &ret();
+
+&set_label("key_192a",16);
+       &$movekey       (&QWP(0,$key),"xmm0");
+       &lea            ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+       &movaps         ("xmm5","xmm2");                # consumed by key_192b
+&set_label("key_192b_warm");
+       &shufps         ("xmm4","xmm0",0b00010000);
+       &movdqa         ("xmm3","xmm2");
+       &xorps          ("xmm0","xmm4");
+       &shufps         ("xmm4","xmm0",0b10001100);
+       &pslldq         ("xmm3",4);
+       &xorps          ("xmm0","xmm4");
+       &pshufd         ("xmm1","xmm1",0b01010101);     # critical path
+       &pxor           ("xmm2","xmm3");
+       &pxor           ("xmm0","xmm1");
+       &pshufd         ("xmm3","xmm0",0b11111111);
+       &pxor           ("xmm2","xmm3");
+       &ret();
+
+&set_label("key_192b",16);
+       &movaps         ("xmm3","xmm0");
+       &shufps         ("xmm5","xmm0",0b01000100);
+       &$movekey       (&QWP(0,$key),"xmm5");
+       &shufps         ("xmm3","xmm2",0b01001110);
+       &$movekey       (&QWP(16,$key),"xmm3");
+       &lea            ($key,&DWP(32,$key));           # two schedule slots written
+       &jmp            (&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+       &movups         ("xmm2",&QWP(16,"eax"));        # remaining half of *userKey
+       &mov            ($rounds,13);
+       &lea            ($key,&DWP(16,$key));
+       &$movekey       (&QWP(-32,$key),"xmm0");        # round 0
+       &$movekey       (&QWP(-16,$key),"xmm2");        # round 1
+       &aeskeygenassist("xmm1","xmm2",0x01);           # round 2
+       &call           (&label("key_256a_cold"));
+       &aeskeygenassist("xmm1","xmm0",0x01);           # round 3
+       &call           (&label("key_256b"));
+       &aeskeygenassist("xmm1","xmm2",0x02);           # round 4
+       &call           (&label("key_256a"));
+       &aeskeygenassist("xmm1","xmm0",0x02);           # round 5
+       &call           (&label("key_256b"));
+       &aeskeygenassist("xmm1","xmm2",0x04);           # round 6
+       &call           (&label("key_256a"));
+       &aeskeygenassist("xmm1","xmm0",0x04);           # round 7
+       &call           (&label("key_256b"));
+       &aeskeygenassist("xmm1","xmm2",0x08);           # round 8
+       &call           (&label("key_256a"));
+       &aeskeygenassist("xmm1","xmm0",0x08);           # round 9
+       &call           (&label("key_256b"));
+       &aeskeygenassist("xmm1","xmm2",0x10);           # round 10
+       &call           (&label("key_256a"));
+       &aeskeygenassist("xmm1","xmm0",0x10);           # round 11
+       &call           (&label("key_256b"));
+       &aeskeygenassist("xmm1","xmm2",0x20);           # round 12
+       &call           (&label("key_256a"));
+       &aeskeygenassist("xmm1","xmm0",0x20);           # round 13
+       &call           (&label("key_256b"));
+       &aeskeygenassist("xmm1","xmm2",0x40);           # round 14
+       &call           (&label("key_256a"));
+       &$movekey       (&QWP(0,$key),"xmm0");
+       &mov            (&DWP(16,$key),$rounds);        # lands at offset 240 of the schedule
+       &xor            ("eax","eax");                  # return 0
+       &ret();
+
+&set_label("key_256a",16);
+       &$movekey       (&QWP(0,$key),"xmm2");
+       &lea            ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+       &shufps         ("xmm4","xmm0",0b00010000);
+       &xorps          ("xmm0","xmm4");
+       &shufps         ("xmm4","xmm0",0b10001100);
+       &xorps          ("xmm0","xmm4");
+       &shufps         ("xmm1","xmm1",0b11111111);     # critical path
+       &xorps          ("xmm0","xmm1");
+       &ret();
+
+&set_label("key_256b",16);
+       &$movekey       (&QWP(0,$key),"xmm0");
+       &lea            ($key,&DWP(16,$key));
+
+       &shufps         ("xmm4","xmm2",0b00010000);
+       &xorps          ("xmm2","xmm4");
+       &shufps         ("xmm4","xmm2",0b10001100);
+       &xorps          ("xmm2","xmm4");
+       &shufps         ("xmm1","xmm1",0b10101010);     # critical path
+       &xorps          ("xmm2","xmm1");
+       &ret();
+
+&set_label("bad_pointer",4);
+       &mov    ("eax",-1);             # return code for a NULL userKey or key
+       &ret    ();
+&set_label("bad_keybits",4);
+       &mov    ("eax",-2);             # return code when bits is not 128, 192 or 256
+       &ret    ();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+#                              AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+       &mov    ("eax",&wparam(0));             # userKey, passed to the helper in eax
+       &mov    ($rounds,&wparam(1));           # bits, passed in $rounds
+       &mov    ($key,&wparam(2));              # key, passed in $key
+       &call   ("_aesni_set_encrypt_key");     # helper leaves the return code in eax
+       &ret    ();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+#                              AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+       &mov    ("eax",&wparam(0));             # userKey
+       &mov    ($rounds,&wparam(1));           # bits
+       &mov    ($key,&wparam(2));              # key
+       &call   ("_aesni_set_encrypt_key");
+       &mov    ($key,&wparam(2));              # reload: the helper advanced $key
+       &shl    ($rounds,4);    # rounds-1 after _aesni_set_encrypt_key
+       &test   ("eax","eax");
+       &jnz    (&label("dec_key_ret"));        # pass the helper's error code through
+       &lea    ("eax",&DWP(16,$key,$rounds));  # end of key schedule
+
+       &$movekey       ("xmm0",&QWP(0,$key));  # just swap
+       &$movekey       ("xmm1",&QWP(0,"eax"));
+       &$movekey       (&QWP(0,"eax"),"xmm0");
+       &$movekey       (&QWP(0,$key),"xmm1");
+       &lea            ($key,&DWP(16,$key));
+       &lea            ("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+       &$movekey       ("xmm0",&QWP(0,$key));  # swap and inverse
+       &$movekey       ("xmm1",&QWP(0,"eax"));
+       &aesimc         ("xmm0","xmm0");
+       &aesimc         ("xmm1","xmm1");
+       &lea            ($key,&DWP(16,$key));
+       &lea            ("eax",&DWP(-16,"eax"));
+       &$movekey       (&QWP(16,"eax"),"xmm0");        # offsets undo the pointer updates above
+       &$movekey       (&QWP(-16,$key),"xmm1");
+       &cmp            ("eax",$key);
+       &ja             (&label("dec_key_inverse"));    # until the two pointers meet
+
+       &$movekey       ("xmm0",&QWP(0,$key));  # inverse middle
+       &aesimc         ("xmm0","xmm0");
+       &$movekey       (&QWP(0,$key),"xmm0");
+
+       &xor            ("eax","eax");          # return success
+&set_label("dec_key_ret");
+       &ret    ();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by <address@hidden>");        # NOTE(review): address looks scrubbed by the list archive
+
+&asm_finish();         # NOTE(review): presumably finalizes the perlasm output; confirm in the perlasm framework
diff --git a/devel/perlasm/aesni-x86_64.pl b/devel/perlasm/aesni-x86_64.pl
new file mode 100644
index 0000000..499f3b3
--- /dev/null
+++ b/devel/perlasm/aesni-x86_64.pl
@@ -0,0 +1,3068 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <address@hidden> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+# details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with 128-bit key. And given their throughput asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# asymptotic limit it's not something you commonly achieve in reality,
+# but how close does one get? Below are results collected for
+# different modes and block sizes. Pairs of numbers are for en-/
+# decryption.
+#
+#      16-byte     64-byte     256-byte    1-KB        8-KB
+# ECB  4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
+# CTR  5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
+# CBC  4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
+# CCM  5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
+# OFB  5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
+# CFB  5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+# The results were collected with specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes in points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because implementation
+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving output. CCM CTR "stays invisible," because it's neatly
+# interleaved with CBC-MAC. This provides ~30% improvement over
+# "straightforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
+# CTR curve doesn't follow this pattern and is "slowest" changing one
+# with "256-byte" result being 87% of "8-KB." This is because overhead
+# in CTR mode is most computationally intensive. Small-block CCM
+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with number of
+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
+# are a tad smaller, because the above mentioned penalty biases all
+# results by same constant value. In similar way function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
+# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3 
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 comes from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor                3x      6x      8x
+# theoretical asymptotic limit         1.67    0.83    0.625
+# measured performance for 8KB block   1.05    0.86    0.84
+#
+# "as if" interleave factor            4.7x    5.8x    6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt                          1.16    0.93    0.93
+# CTR                                  1.14    0.91    n/a
+#
+# Well, given 3x column it's probably inappropriate to call the limit
+# asymptotic, if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instructions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
+# utilizes 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible ~1.5%, other
+# parallelizables perform ~5% worse, which is outweighed by ~25%
+# improvement on Sandy Bridge. To balance regression on Westmere
+# CTR mode was implemented with 6x aesenc interleave factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# in CTR mode AES instruction interleave factor was chosen to be 6x.
+
+$PREFIX="aesni";       # if $PREFIX is set to "AES", the script
+                       # generates drop-in replacement for
+                       # crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# Instruction used to load round keys. Both arms currently select the
+# same mnemonic; the conditional is kept as the place to switch per-$PREFIX.
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+# Registers carrying the first four integer arguments under the target
+# ABI. Used by the entry points that take their arguments from
+# @_4args directly.
+@_4args=$win64?  ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+               ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+
+$code=".text\n";
+
+$rounds="%eax";        # input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx";   # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8";    # cbc, ctr, ...
+
+$rnds_="%r10d";        # backup copy for $rounds
+$key_="%r11";  # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0";      $rndkey1="%xmm1";
+$inout0="%xmm2";       $inout1="%xmm3";
+$inout2="%xmm4";       $inout3="%xmm5";
+$inout4="%xmm6";       $inout5="%xmm7";
+$inout6="%xmm8";       $inout7="%xmm9";
+
+$in2="%xmm6";          $in1="%xmm7";   # used in CBC decrypt, CTR, ...
+$in0="%xmm8";          $iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
+# cycles which take care of loop variables...
+{ my $sn;
+sub aesni_generate1 {
+# Appends an inline single-block aes${p} sequence to $code.
+#   $p      - instruction suffix, "enc" or "dec" at the call sites
+#   $key    - register pointing at the key schedule (advanced by the code)
+#   $rounds - register holding the loop count (decremented to zero)
+#   $inout  - register with the block to process, $inout0 when omitted
+#   $ivec   - optional register; when given, the round-0 key is xor-ed
+#             into it and it is then xor-ed into $inout
+# $sn is bumped on every expansion so that each copy of the loop label
+# is unique.
+my ($p,$key,$rounds,$inout,$ivec)=@_;  $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+       $movkey ($key),$rndkey0
+       $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+       xorps   $rndkey0,$ivec
+       lea     32($key),$key
+       xorps   $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+       lea     32($key),$key
+       xorps   $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+       aes${p} $rndkey1,$inout
+       dec     $rounds
+       $movkey ($key),$rndkey1
+       lea     16($key),$key
+       jnz     .Loop_${p}1_$sn # loop body is 16 bytes
+       aes${p}last     $rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+# Single-block entry points. Arguments are taken straight from the
+# registers in @_4args, so both symbols are declared \@abi-omnipotent.
+# The round count is read from offset 240 of the key structure.
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type  ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+       movups  ($inp),$inout0          # load input
+       mov     240($key),$rounds       # key->rounds
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       movups  $inout0,($out)          # output
+       ret
+.size  ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type  ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+       movups  ($inp),$inout0          # load input
+       mov     240($key),$rounds       # key->rounds
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       movups  $inout0,($out)          # output
+       ret
+.size  ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x...
+sub aesni_generate3 {
+# Appends the private 3-block routine _aesni_${dir}rypt3 to $code.
+# $dir is the aes instruction suffix ("enc" or "dec" at the call sites).
+# The emitted code halves $rounds up front and consumes two round keys
+# per loop iteration, alternating between $rndkey1 and $rndkey0.
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+       $movkey ($key),$rndkey0
+       shr     \$1,$rounds
+       $movkey 16($key),$rndkey1
+       lea     32($key),$key
+       xorps   $rndkey0,$inout0
+       xorps   $rndkey0,$inout1
+       xorps   $rndkey0,$inout2
+       $movkey         ($key),$rndkey0
+
+.L${dir}_loop3:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       dec             $rounds
+       aes${dir}       $rndkey1,$inout2
+       $movkey         16($key),$rndkey1
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       lea             32($key),$key
+       aes${dir}       $rndkey0,$inout2
+       $movkey         ($key),$rndkey0
+       jnz             .L${dir}_loop3
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       ret
+.size  _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+# Appends the private 4-block routine _aesni_${dir}rypt4 to $code.
+# $dir is the aes instruction suffix ("enc" or "dec" at the call sites).
+# Same shape as the 3-block routine: $rounds is halved and each loop
+# iteration applies two round keys to all four $inout registers.
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+       $movkey ($key),$rndkey0
+       shr     \$1,$rounds
+       $movkey 16($key),$rndkey1
+       lea     32($key),$key
+       xorps   $rndkey0,$inout0
+       xorps   $rndkey0,$inout1
+       xorps   $rndkey0,$inout2
+       xorps   $rndkey0,$inout3
+       $movkey ($key),$rndkey0
+
+.L${dir}_loop4:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       dec             $rounds
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       $movkey         16($key),$rndkey1
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       lea             32($key),$key
+       aes${dir}       $rndkey0,$inout2
+       aes${dir}       $rndkey0,$inout3
+       $movkey         ($key),$rndkey0
+       jnz             .L${dir}_loop4
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       aes${dir}last   $rndkey0,$inout3
+       ret
+.size  _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+       $movkey         ($key),$rndkey0
+       shr             \$1,$rounds
+       $movkey         16($key),$rndkey1
+       lea             32($key),$key
+       xorps           $rndkey0,$inout0
+       pxor            $rndkey0,$inout1
+       aes${dir}       $rndkey1,$inout0
+       pxor            $rndkey0,$inout2
+       aes${dir}       $rndkey1,$inout1
+       pxor            $rndkey0,$inout3
+       aes${dir}       $rndkey1,$inout2
+       pxor            $rndkey0,$inout4
+       aes${dir}       $rndkey1,$inout3
+       pxor            $rndkey0,$inout5
+       dec             $rounds
+       aes${dir}       $rndkey1,$inout4
+       $movkey         ($key),$rndkey0
+       aes${dir}       $rndkey1,$inout5
+       jmp             .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       dec             $rounds
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+.L${dir}_loop6_enter:                          # happens to be 16-byte aligned
+       $movkey         16($key),$rndkey1
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       lea             32($key),$key
+       aes${dir}       $rndkey0,$inout2
+       aes${dir}       $rndkey0,$inout3
+       aes${dir}       $rndkey0,$inout4
+       aes${dir}       $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       jnz             .L${dir}_loop6
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       aes${dir}last   $rndkey0,$inout3
+       aes${dir}last   $rndkey0,$inout4
+       aes${dir}last   $rndkey0,$inout5
+       ret
+.size  _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type  _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+       $movkey         ($key),$rndkey0
+       shr             \$1,$rounds
+       $movkey         16($key),$rndkey1
+       lea             32($key),$key
+       xorps           $rndkey0,$inout0
+       xorps           $rndkey0,$inout1
+       aes${dir}       $rndkey1,$inout0
+       pxor            $rndkey0,$inout2
+       aes${dir}       $rndkey1,$inout1
+       pxor            $rndkey0,$inout3
+       aes${dir}       $rndkey1,$inout2
+       pxor            $rndkey0,$inout4
+       aes${dir}       $rndkey1,$inout3
+       pxor            $rndkey0,$inout5
+       dec             $rounds
+       aes${dir}       $rndkey1,$inout4
+       pxor            $rndkey0,$inout6
+       aes${dir}       $rndkey1,$inout5
+       pxor            $rndkey0,$inout7
+       $movkey         ($key),$rndkey0
+       aes${dir}       $rndkey1,$inout6
+       aes${dir}       $rndkey1,$inout7
+       $movkey         16($key),$rndkey1
+       jmp             .L${dir}_loop8_enter
+.align 16
+.L${dir}_loop8:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       dec             $rounds
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       aes${dir}       $rndkey1,$inout6
+       aes${dir}       $rndkey1,$inout7
+       $movkey         16($key),$rndkey1
+.L${dir}_loop8_enter:                          # happens to be 16-byte aligned
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       lea             32($key),$key
+       aes${dir}       $rndkey0,$inout2
+       aes${dir}       $rndkey0,$inout3
+       aes${dir}       $rndkey0,$inout4
+       aes${dir}       $rndkey0,$inout5
+       aes${dir}       $rndkey0,$inout6
+       aes${dir}       $rndkey0,$inout7
+       $movkey         ($key),$rndkey0
+       jnz             .L${dir}_loop8
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}       $rndkey1,$inout2
+       aes${dir}       $rndkey1,$inout3
+       aes${dir}       $rndkey1,$inout4
+       aes${dir}       $rndkey1,$inout5
+       aes${dir}       $rndkey1,$inout6
+       aes${dir}       $rndkey1,$inout7
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       aes${dir}last   $rndkey0,$inout2
+       aes${dir}last   $rndkey0,$inout3
+       aes${dir}last   $rndkey0,$inout4
+       aes${dir}last   $rndkey0,$inout5
+       aes${dir}last   $rndkey0,$inout6
+       aes${dir}last   $rndkey0,$inout7
+       ret
+.size  _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+#                        size_t length, const AES_KEY *key,
+#                        int enc);
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type  aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+       and     \$-16,$len
+       jz      .Lecb_ret
+
+       mov     240($key),$rounds       # key->rounds
+       $movkey ($key),$rndkey0
+       mov     $key,$key_              # backup $key
+       mov     $rounds,$rnds_          # backup $rounds
+       test    %r8d,%r8d               # 5th argument
+       jz      .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+       cmp     \$0x80,$len
+       jb      .Lecb_enc_tail
+
+       movdqu  ($inp),$inout0
+       movdqu  0x10($inp),$inout1
+       movdqu  0x20($inp),$inout2
+       movdqu  0x30($inp),$inout3
+       movdqu  0x40($inp),$inout4
+       movdqu  0x50($inp),$inout5
+       movdqu  0x60($inp),$inout6
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp
+       sub     \$0x80,$len
+       jmp     .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+       movups  $inout0,($out)
+       mov     $key_,$key              # restore $key
+       movdqu  ($inp),$inout0
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout1,0x10($out)
+       movdqu  0x10($inp),$inout1
+       movups  $inout2,0x20($out)
+       movdqu  0x20($inp),$inout2
+       movups  $inout3,0x30($out)
+       movdqu  0x30($inp),$inout3
+       movups  $inout4,0x40($out)
+       movdqu  0x40($inp),$inout4
+       movups  $inout5,0x50($out)
+       movdqu  0x50($inp),$inout5
+       movups  $inout6,0x60($out)
+       movdqu  0x60($inp),$inout6
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+       call    _aesni_encrypt8
+
+       sub     \$0x80,$len
+       jnc     .Lecb_enc_loop8
+
+       movups  $inout0,($out)
+       mov     $key_,$key              # restore $key
+       movups  $inout1,0x10($out)
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       movups  $inout6,0x60($out)
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out
+       add     \$0x80,$len
+       jz      .Lecb_ret
+
+.Lecb_enc_tail:
+       movups  ($inp),$inout0
+       cmp     \$0x20,$len
+       jb      .Lecb_enc_one
+       movups  0x10($inp),$inout1
+       je      .Lecb_enc_two
+       movups  0x20($inp),$inout2
+       cmp     \$0x40,$len
+       jb      .Lecb_enc_three
+       movups  0x30($inp),$inout3
+       je      .Lecb_enc_four
+       movups  0x40($inp),$inout4
+       cmp     \$0x60,$len
+       jb      .Lecb_enc_five
+       movups  0x50($inp),$inout5
+       je      .Lecb_enc_six
+       movdqu  0x60($inp),$inout6
+       call    _aesni_encrypt8
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       movups  $inout6,0x60($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       movups  $inout0,($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_two:
+       xorps   $inout2,$inout2
+       call    _aesni_encrypt3
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_three:
+       call    _aesni_encrypt3
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_four:
+       call    _aesni_encrypt4
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_five:
+       xorps   $inout5,$inout5
+       call    _aesni_encrypt6
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_enc_six:
+       call    _aesni_encrypt6
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       jmp     .Lecb_ret
+#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+       cmp     \$0x80,$len
+       jb      .Lecb_dec_tail
+
+       movdqu  ($inp),$inout0
+       movdqu  0x10($inp),$inout1
+       movdqu  0x20($inp),$inout2
+       movdqu  0x30($inp),$inout3
+       movdqu  0x40($inp),$inout4
+       movdqu  0x50($inp),$inout5
+       movdqu  0x60($inp),$inout6
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp
+       sub     \$0x80,$len
+       jmp     .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+       movups  $inout0,($out)
+       mov     $key_,$key              # restore $key
+       movdqu  ($inp),$inout0
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout1,0x10($out)
+       movdqu  0x10($inp),$inout1
+       movups  $inout2,0x20($out)
+       movdqu  0x20($inp),$inout2
+       movups  $inout3,0x30($out)
+       movdqu  0x30($inp),$inout3
+       movups  $inout4,0x40($out)
+       movdqu  0x40($inp),$inout4
+       movups  $inout5,0x50($out)
+       movdqu  0x50($inp),$inout5
+       movups  $inout6,0x60($out)
+       movdqu  0x60($inp),$inout6
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out
+       movdqu  0x70($inp),$inout7
+       lea     0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+       call    _aesni_decrypt8
+
+       $movkey ($key_),$rndkey0
+       sub     \$0x80,$len
+       jnc     .Lecb_dec_loop8
+
+       movups  $inout0,($out)
+       mov     $key_,$key              # restore $key
+       movups  $inout1,0x10($out)
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       movups  $inout6,0x60($out)
+       movups  $inout7,0x70($out)
+       lea     0x80($out),$out
+       add     \$0x80,$len
+       jz      .Lecb_ret
+
+.Lecb_dec_tail:
+       movups  ($inp),$inout0
+       cmp     \$0x20,$len
+       jb      .Lecb_dec_one
+       movups  0x10($inp),$inout1
+       je      .Lecb_dec_two
+       movups  0x20($inp),$inout2
+       cmp     \$0x40,$len
+       jb      .Lecb_dec_three
+       movups  0x30($inp),$inout3
+       je      .Lecb_dec_four
+       movups  0x40($inp),$inout4
+       cmp     \$0x60,$len
+       jb      .Lecb_dec_five
+       movups  0x50($inp),$inout5
+       je      .Lecb_dec_six
+       movups  0x60($inp),$inout6
+       $movkey ($key),$rndkey0
+       call    _aesni_decrypt8
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       movups  $inout6,0x60($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       movups  $inout0,($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_two:
+       xorps   $inout2,$inout2
+       call    _aesni_decrypt3
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_three:
+       call    _aesni_decrypt3
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_four:
+       call    _aesni_decrypt4
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_five:
+       xorps   $inout5,$inout5
+       call    _aesni_decrypt6
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       jmp     .Lecb_ret
+.align 16
+.Lecb_dec_six:
+       call    _aesni_decrypt6
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+
+.Lecb_ret:
+       ret
+.size  aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9";        # 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type  aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+       lea     -0x58(%rsp),%rsp
+       movaps  %xmm6,(%rsp)
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+       mov     240($key),$rounds               # key->rounds
+       movdqu  ($ivp),$iv
+       movdqa  .Lincrement64(%rip),$increment
+       movdqa  .Lbswap_mask(%rip),$bswap_mask
+
+       shr     \$1,$rounds
+       lea     0($key),$key_
+       movdqu  ($cmac),$inout1
+       movdqa  $iv,$inout0
+       mov     $rounds,$rnds_
+       pshufb  $bswap_mask,$iv
+       jmp     .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+       $movkey ($key_),$rndkey0
+       mov     $rnds_,$rounds
+       movups  ($inp),$in0                     # load inp
+
+       xorps   $rndkey0,$inout0                # counter
+       $movkey 16($key_),$rndkey1
+       xorps   $in0,$rndkey0
+       lea     32($key_),$key
+       xorps   $rndkey0,$inout1                # cmac^=inp
+       $movkey ($key),$rndkey0
+
+.Lccm64_enc2_loop:
+       aesenc  $rndkey1,$inout0
+       dec     $rounds
+       aesenc  $rndkey1,$inout1
+       $movkey 16($key),$rndkey1
+       aesenc  $rndkey0,$inout0
+       lea     32($key),$key
+       aesenc  $rndkey0,$inout1
+       $movkey 0($key),$rndkey0
+       jnz     .Lccm64_enc2_loop
+       aesenc  $rndkey1,$inout0
+       aesenc  $rndkey1,$inout1
+       paddq   $increment,$iv
+       aesenclast      $rndkey0,$inout0
+       aesenclast      $rndkey0,$inout1
+
+       dec     $len
+       lea     16($inp),$inp
+       xorps   $inout0,$in0                    # inp ^= E(iv)
+       movdqa  $iv,$inout0
+       movups  $in0,($out)                     # save output
+       lea     16($out),$out
+       pshufb  $bswap_mask,$inout0
+       jnz     .Lccm64_enc_outer
+
+       movups  $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       lea     0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+       ret
+.size  aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+# aesni_ccm64_decrypt_blocks
+#
+# The first counter block is encrypted up front with the single-block
+# generator.  Each pass of .Lccm64_dec_outer then xors the pending input
+# block with the keystream from the previous pass and stores the result.
+# Unless that was the last block, it goes on to encrypt the next counter
+# ($inout0) and the CMAC folded with the just-produced output ($inout1)
+# in lock-step.  After the final block, .Lccm64_dec_break runs one more
+# single-block encryption on $inout1 and stores it through $cmac.
+#
+# The ",\@function,6" suffix on .type must stay escaped: inside the heredoc
+# an unescaped "@" would be interpolated by Perl as an array.
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type  aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+# Win64 only: reserve 0x58 bytes and preserve xmm6-xmm9 around the body.
+$code.=<<___ if ($win64);
+       lea     -0x58(%rsp),%rsp
+       movaps  %xmm6,(%rsp)
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+       mov     240($key),$rounds               # key->rounds
+       movups  ($ivp),$iv
+       movdqu  ($cmac),$inout1
+       movdqa  .Lincrement64(%rip),$increment
+       movdqa  .Lbswap_mask(%rip),$bswap_mask
+
+       movaps  $iv,$inout0
+       mov     $rounds,$rnds_                  # backup $rounds (not halved here)
+       mov     $key,$key_                      # backup $key
+       pshufb  $bswap_mask,$iv
+___
+       # keystream for the first block
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       movups  ($inp),$in0                     # load inp
+       paddq   $increment,$iv
+       lea     16($inp),$inp
+       jmp     .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+       xorps   $inout0,$in0                    # inp ^= E(iv)
+       movdqa  $iv,$inout0
+       mov     $rnds_,$rounds
+       movups  $in0,($out)                     # save output
+       lea     16($out),$out
+       pshufb  $bswap_mask,$inout0
+
+       sub     \$1,$len
+       jz      .Lccm64_dec_break               # last block: only the cmac is left
+
+       $movkey ($key_),$rndkey0
+       shr     \$1,$rounds                     # loop below eats two round keys per pass
+       $movkey 16($key_),$rndkey1
+       xorps   $rndkey0,$in0
+       lea     32($key_),$key
+       xorps   $rndkey0,$inout0
+       xorps   $in0,$inout1                    # cmac^=out
+       $movkey ($key),$rndkey0
+
+.Lccm64_dec2_loop:
+       aesenc  $rndkey1,$inout0
+       dec     $rounds
+       aesenc  $rndkey1,$inout1
+       $movkey 16($key),$rndkey1
+       aesenc  $rndkey0,$inout0
+       lea     32($key),$key
+       aesenc  $rndkey0,$inout1
+       $movkey 0($key),$rndkey0
+       jnz     .Lccm64_dec2_loop               # flags still from dec above
+       movups  ($inp),$in0                     # load inp
+       paddq   $increment,$iv
+       aesenc  $rndkey1,$inout0
+       aesenc  $rndkey1,$inout1
+       lea     16($inp),$inp
+       aesenclast      $rndkey0,$inout0
+       aesenclast      $rndkey0,$inout1
+       jmp     .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+       #xorps  $in0,$inout1                    # cmac^=out
+___
+       # $rounds is the full count again (reloaded from $rnds_ at the top of
+       # the outer loop).  NOTE(review): presumably passing $in0 as the last
+       # argument makes the helper xor it into $inout1 first, which would be
+       # why the xorps above is commented out -- confirm in aesni_generate1.
+       &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+       movups  $inout1,($cmac)
+___
+# Win64 only: restore xmm6-xmm9 and release the frame.
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       lea     0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+       ret
+.size  aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+{
+# $reserved(%rsp) and $reserved+0x10(%rsp) hold the two saved counter
+# vectors: offset 0 on Win64, 0x28 bytes below %rsp otherwise.
+my $reserved = -0x28;
+$reserved = 0 if ($win64);
+# xmm8-xmm11: input blocks; xmm12/xmm13: counter vectors; xmm14: IV with
+# its counter dword wiped; xmm15: byte-swap mask.
+my ($in0,$in1,$in2,$in3,$iv0,$iv1,$ivec,$bswap_mask)=map("%xmm$_",(8..15));
+
+# aesni_ctr32_encrypt_blocks
+#
+# A single block ($len == 1) takes .Lctr32_one_shortcut and encrypts *ivec
+# as loaded.  Otherwise the 32-bit counter is pulled out of dword 3 of the
+# IV, byte-swapped and expanded into two vectors of three consecutive
+# counters ($iv0 = n,n+1,n+2 and $iv1 = n+3,n+4,n+5), which are also kept
+# at $reserved(%rsp).  .Lctr32_loop6 encrypts six counter blocks per pass
+# with an inlined six-way AES and advances the stored counters using
+# .Lincrement32.  .Lctr32_tail finishes the remaining 1..5 blocks through
+# _aesni_encrypt3/4/6 or the one-block generator.
+#
+# NOTE(review): $len == 0 is not rejected; it falls into .Lctr32_tail and
+# is handled like a single block -- confirm callers never pass it.
+#
+# The ",\@function,5" suffix on .type must stay escaped: inside the heredoc
+# an unescaped "@" would be interpolated by Perl as an array.
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type  aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+___
+# Win64 only: reserve 0xc8 bytes and preserve xmm6-xmm15 at 0x20..0xb0;
+# the low 0x20 bytes are the counter save area ($reserved is 0 there).
+$code.=<<___ if ($win64);
+       lea     -0xc8(%rsp),%rsp
+       movaps  %xmm6,0x20(%rsp)
+       movaps  %xmm7,0x30(%rsp)
+       movaps  %xmm8,0x40(%rsp)
+       movaps  %xmm9,0x50(%rsp)
+       movaps  %xmm10,0x60(%rsp)
+       movaps  %xmm11,0x70(%rsp)
+       movaps  %xmm12,0x80(%rsp)
+       movaps  %xmm13,0x90(%rsp)
+       movaps  %xmm14,0xa0(%rsp)
+       movaps  %xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+       cmp     \$1,$len
+       je      .Lctr32_one_shortcut
+
+       movdqu  ($ivp),$ivec
+       movdqa  .Lbswap_mask(%rip),$bswap_mask
+       xor     $rounds,$rounds
+       pextrd  \$3,$ivec,$rnds_                # pull 32-bit counter
+       pinsrd  \$3,$rounds,$ivec               # wipe 32-bit counter
+
+       mov     240($key),$rounds               # key->rounds
+       bswap   $rnds_
+       pxor    $iv0,$iv0                       # vector of 3 32-bit counters
+       pxor    $iv1,$iv1                       # vector of 3 32-bit counters
+       pinsrd  \$0,$rnds_,$iv0
+       lea     3($rnds_),$key_
+       pinsrd  \$0,$key_,$iv1
+       inc     $rnds_
+       pinsrd  \$1,$rnds_,$iv0
+       inc     $key_
+       pinsrd  \$1,$key_,$iv1
+       inc     $rnds_
+       pinsrd  \$2,$rnds_,$iv0
+       inc     $key_
+       pinsrd  \$2,$key_,$iv1
+       movdqa  $iv0,$reserved(%rsp)
+       pshufb  $bswap_mask,$iv0
+       movdqa  $iv1,`$reserved+0x10`(%rsp)
+       pshufb  $bswap_mask,$iv1
+
+       pshufd  \$`3<<6`,$iv0,$inout0           # place counter to upper dword
+       pshufd  \$`2<<6`,$iv0,$inout1
+       pshufd  \$`1<<6`,$iv0,$inout2
+       cmp     \$6,$len
+       jb      .Lctr32_tail
+       shr     \$1,$rounds
+       mov     $key,$key_                      # backup $key
+       mov     $rounds,$rnds_                  # backup $rounds
+       sub     \$6,$len
+       jmp     .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+       pshufd  \$`3<<6`,$iv1,$inout3
+       por     $ivec,$inout0                   # merge counter-less ivec
+        $movkey        ($key_),$rndkey0
+       pshufd  \$`2<<6`,$iv1,$inout4
+       por     $ivec,$inout1
+        $movkey        16($key_),$rndkey1
+       pshufd  \$`1<<6`,$iv1,$inout5
+       por     $ivec,$inout2
+       por     $ivec,$inout3
+        xorps          $rndkey0,$inout0
+       por     $ivec,$inout4
+       por     $ivec,$inout5
+
+       # inline _aesni_encrypt6 and interleave last rounds
+       # with own code...
+
+       pxor            $rndkey0,$inout1
+       aesenc          $rndkey1,$inout0
+       lea             32($key_),$key
+       pxor            $rndkey0,$inout2
+       aesenc          $rndkey1,$inout1
+        movdqa         .Lincrement32(%rip),$iv1
+       pxor            $rndkey0,$inout3
+       aesenc          $rndkey1,$inout2
+        movdqa         $reserved(%rsp),$iv0
+       pxor            $rndkey0,$inout4
+       aesenc          $rndkey1,$inout3
+       pxor            $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       dec             $rounds
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       jmp             .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       dec             $rounds
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+       $movkey         16($key),$rndkey1
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       lea             32($key),$key
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       jnz             .Lctr32_enc_loop6       # flags still from dec above
+
+       aesenc          $rndkey1,$inout0
+        paddd          $iv1,$iv0               # increment counter vector
+       aesenc          $rndkey1,$inout1
+        paddd          `$reserved+0x10`(%rsp),$iv1
+       aesenc          $rndkey1,$inout2
+        movdqa         $iv0,$reserved(%rsp)    # save counter vector
+       aesenc          $rndkey1,$inout3
+        movdqa         $iv1,`$reserved+0x10`(%rsp)
+       aesenc          $rndkey1,$inout4
+        pshufb         $bswap_mask,$iv0        # byte swap
+       aesenc          $rndkey1,$inout5
+        pshufb         $bswap_mask,$iv1
+
+       aesenclast      $rndkey0,$inout0
+        movups         ($inp),$in0             # load input
+       aesenclast      $rndkey0,$inout1
+        movups         0x10($inp),$in1
+       aesenclast      $rndkey0,$inout2
+        movups         0x20($inp),$in2
+       aesenclast      $rndkey0,$inout3
+        movups         0x30($inp),$in3
+       aesenclast      $rndkey0,$inout4
+        movups         0x40($inp),$rndkey1     # round-key regs reused for input
+       aesenclast      $rndkey0,$inout5
+        movups         0x50($inp),$rndkey0
+        lea    0x60($inp),$inp
+
+       xorps   $inout0,$in0                    # xor
+        pshufd \$`3<<6`,$iv0,$inout0
+       xorps   $inout1,$in1
+        pshufd \$`2<<6`,$iv0,$inout1
+       movups  $in0,($out)                     # store output
+       xorps   $inout2,$in2
+        pshufd \$`1<<6`,$iv0,$inout2
+       movups  $in1,0x10($out)
+       xorps   $inout3,$in3
+       movups  $in2,0x20($out)
+       xorps   $inout4,$rndkey1
+       movups  $in3,0x30($out)
+       xorps   $inout5,$rndkey0
+       movups  $rndkey1,0x40($out)
+       movups  $rndkey0,0x50($out)
+       lea     0x60($out),$out
+       mov     $rnds_,$rounds
+       sub     \$6,$len
+       jnc     .Lctr32_loop6
+
+       add     \$6,$len
+       jz      .Lctr32_done
+       mov     $key_,$key                      # restore $key
+       lea     1($rounds,$rounds),$rounds      # restore original value
+
+.Lctr32_tail:
+       por     $ivec,$inout0
+       movups  ($inp),$in0
+       cmp     \$2,$len
+       jb      .Lctr32_one
+
+       por     $ivec,$inout1
+       movups  0x10($inp),$in1
+       je      .Lctr32_two                     # flags still from cmp 2
+
+       pshufd  \$`3<<6`,$iv1,$inout3
+       por     $ivec,$inout2
+       movups  0x20($inp),$in2
+       cmp     \$4,$len
+       jb      .Lctr32_three
+
+       pshufd  \$`2<<6`,$iv1,$inout4
+       por     $ivec,$inout3
+       movups  0x30($inp),$in3
+       je      .Lctr32_four                    # flags still from cmp 4
+
+       por     $ivec,$inout4
+       xorps   $inout5,$inout5                 # sixth lane unused: five blocks left
+
+       call    _aesni_encrypt6
+
+       movups  0x40($inp),$rndkey1
+       xorps   $inout0,$in0
+       xorps   $inout1,$in1
+       movups  $in0,($out)
+       xorps   $inout2,$in2
+       movups  $in1,0x10($out)
+       xorps   $inout3,$in3
+       movups  $in2,0x20($out)
+       xorps   $inout4,$rndkey1
+       movups  $in3,0x30($out)
+       movups  $rndkey1,0x40($out)
+       jmp     .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+       movups  ($ivp),$inout0
+       movups  ($inp),$in0
+       mov     240($key),$rounds               # key->rounds
+.Lctr32_one:
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       xorps   $inout0,$in0
+       movups  $in0,($out)
+       jmp     .Lctr32_done
+
+.align 16
+.Lctr32_two:
+       xorps   $inout2,$inout2                 # third lane unused
+       call    _aesni_encrypt3
+       xorps   $inout0,$in0
+       xorps   $inout1,$in1
+       movups  $in0,($out)
+       movups  $in1,0x10($out)
+       jmp     .Lctr32_done
+
+.align 16
+.Lctr32_three:
+       call    _aesni_encrypt3
+       xorps   $inout0,$in0
+       xorps   $inout1,$in1
+       movups  $in0,($out)
+       xorps   $inout2,$in2
+       movups  $in1,0x10($out)
+       movups  $in2,0x20($out)
+       jmp     .Lctr32_done
+
+.align 16
+.Lctr32_four:
+       call    _aesni_encrypt4
+       xorps   $inout0,$in0
+       xorps   $inout1,$in1
+       movups  $in0,($out)
+       xorps   $inout2,$in2
+       movups  $in1,0x10($out)
+       xorps   $inout3,$in3
+       movups  $in2,0x20($out)
+       movups  $in3,0x30($out)
+
+.Lctr32_done:
+___
+# Win64 only: restore xmm6-xmm15 and release the frame.
+$code.=<<___ if ($win64);
+       movaps  0x20(%rsp),%xmm6
+       movaps  0x30(%rsp),%xmm7
+       movaps  0x40(%rsp),%xmm8
+       movaps  0x50(%rsp),%xmm9
+       movaps  0x60(%rsp),%xmm10
+       movaps  0x70(%rsp),%xmm11
+       movaps  0x80(%rsp),%xmm12
+       movaps  0x90(%rsp),%xmm13
+       movaps  0xa0(%rsp),%xmm14
+       movaps  0xb0(%rsp),%xmm15
+       lea     0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+       ret
+.size  aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#      const AES_KEY *key1, const AES_KEY *key2,
+#      const unsigned char iv[16]);
+#
+{
+# xmm10-xmm15 carry the tweaks; tweak[5] is the running (newest) value.
+my @tweak = map { "%xmm$_" } 10 .. 15;
+my $twmask = "%xmm8";
+my $twres = "%xmm9";
+my $twtmp = $tweak[4];                  # shares its register with tweak[4]
+my $key2 = "%r8";
+my $ivp = "%r9";
+my $len_ = "%r9";                       # same register as $ivp
+my $frame_size = $win64 ? 0x68 + 160 : 0x68;
+
+# aesni_xts_encrypt
+#
+# Flow of the generated routine:
+#   1. Encrypt the 16 bytes at $ivp with $key2 to obtain the initial tweak
+#      (kept in tweak[5]).
+#   2. Precompute four successive tweaks into tweak[0..3]; each step
+#      doubles tweak[5] with paddq and patches in the carry/residue taken
+#      from .Lxts_magic.
+#   3. .Lxts_enc_grandloop handles 16*6 bytes per pass with an inlined
+#      six-way AES, parking the six tweaks at 0..0x50(%rsp) and deriving
+#      the next ones between the final rounds.
+#   4. .Lxts_enc_short finishes 1..5 whole blocks via _aesni_encrypt3/4/6
+#      or the one-block generator, leaving the next unused tweak in
+#      tweak[0].
+#   5. If len%16 != 0, .Lxts_enc_steal swaps the trailing input bytes with
+#      the head of the last output block and re-encrypts that block under
+#      tweak[0].
+#
+# $len_ shares %r9 with $ivp; the IV is consumed before $len_ is written.
+#
+# The ",\@function,6" suffix on .type must stay escaped: inside the heredoc
+# an unescaped "@" would be interpolated by Perl as an array.
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type  aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+       lea     -$frame_size(%rsp),%rsp
+___
+# Win64 only: preserve xmm6-xmm15 above the 0x60-byte tweak scratch area.
+$code.=<<___ if ($win64);
+       movaps  %xmm6,0x60(%rsp)
+       movaps  %xmm7,0x70(%rsp)
+       movaps  %xmm8,0x80(%rsp)
+       movaps  %xmm9,0x90(%rsp)
+       movaps  %xmm10,0xa0(%rsp)
+       movaps  %xmm11,0xb0(%rsp)
+       movaps  %xmm12,0xc0(%rsp)
+       movaps  %xmm13,0xd0(%rsp)
+       movaps  %xmm14,0xe0(%rsp)
+       movaps  %xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+       movups  ($ivp),@tweak[5]                # load clear-text tweak
+       mov     240($key2),$rounds              # key2->rounds
+       mov     240($key),$rnds_                # key1->rounds
+___
+       # generate the tweak
+       &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+       mov     $key,$key_                      # backup $key
+       mov     $rnds_,$rounds                  # backup $rounds
+       mov     $len,$len_                      # backup $len
+       and     \$-16,$len
+
+       movdqa  .Lxts_magic(%rip),$twmask
+       pxor    $twtmp,$twtmp
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+___
+    # tweak[0..3] = four successive tweaks; tweak[5] ends up as the fifth
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[$i]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+       pand    $twmask,$twres                  # isolate carry and residue
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+       pxor    $twres,@tweak[5]
+___
+    }
+$code.=<<___;
+       sub     \$16*6,$len
+       jc      .Lxts_enc_short
+
+       shr     \$1,$rounds
+       sub     \$1,$rounds
+       mov     $rounds,$rnds_
+       jmp     .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+       pshufd  \$0x13,$twtmp,$twres
+       movdqa  @tweak[5],@tweak[4]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+       movdqu  `16*0`($inp),$inout0            # load input
+       pand    $twmask,$twres                  # isolate carry and residue
+       movdqu  `16*1`($inp),$inout1
+       pxor    $twres,@tweak[5]
+
+       movdqu  `16*2`($inp),$inout2
+       pxor    @tweak[0],$inout0               # input^=tweak
+       movdqu  `16*3`($inp),$inout3
+       pxor    @tweak[1],$inout1
+       movdqu  `16*4`($inp),$inout4
+       pxor    @tweak[2],$inout2
+       movdqu  `16*5`($inp),$inout5
+       lea     `16*6`($inp),$inp
+       pxor    @tweak[3],$inout3
+       $movkey         ($key_),$rndkey0
+       pxor    @tweak[4],$inout4
+       pxor    @tweak[5],$inout5
+
+       # inline _aesni_encrypt6 and interleave first and last rounds
+       # with own code...
+       $movkey         16($key_),$rndkey1
+       pxor            $rndkey0,$inout0
+       pxor            $rndkey0,$inout1
+        movdqa @tweak[0],`16*0`(%rsp)          # put aside tweaks
+       aesenc          $rndkey1,$inout0
+       lea             32($key_),$key
+       pxor            $rndkey0,$inout2
+        movdqa @tweak[1],`16*1`(%rsp)
+       aesenc          $rndkey1,$inout1
+       pxor            $rndkey0,$inout3
+        movdqa @tweak[2],`16*2`(%rsp)
+       aesenc          $rndkey1,$inout2
+       pxor            $rndkey0,$inout4
+        movdqa @tweak[3],`16*3`(%rsp)
+       aesenc          $rndkey1,$inout3
+       pxor            $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       dec             $rounds
+        movdqa @tweak[4],`16*4`(%rsp)
+       aesenc          $rndkey1,$inout4
+        movdqa @tweak[5],`16*5`(%rsp)
+       aesenc          $rndkey1,$inout5
+       pxor    $twtmp,$twtmp
+       pcmpgtd @tweak[5],$twtmp
+       jmp             .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       dec             $rounds
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+       $movkey         16($key),$rndkey1
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       lea             32($key),$key
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       jnz             .Lxts_enc_loop6         # flags still from dec above
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesenc         $rndkey1,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesenc         $rndkey1,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+        aesenc         $rndkey1,$inout2
+       pxor    $twres,@tweak[5]
+        aesenc         $rndkey1,$inout3
+        aesenc         $rndkey1,$inout4
+        aesenc         $rndkey1,$inout5
+        $movkey        16($key),$rndkey1
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[0]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesenc         $rndkey0,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesenc         $rndkey0,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+        aesenc         $rndkey0,$inout2
+       pxor    $twres,@tweak[5]
+        aesenc         $rndkey0,$inout3
+        aesenc         $rndkey0,$inout4
+        aesenc         $rndkey0,$inout5
+        $movkey        32($key),$rndkey0
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[1]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesenc         $rndkey1,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesenc         $rndkey1,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+        aesenc         $rndkey1,$inout2
+       pxor    $twres,@tweak[5]
+        aesenc         $rndkey1,$inout3
+        aesenc         $rndkey1,$inout4
+        aesenc         $rndkey1,$inout5
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[2]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesenclast     $rndkey0,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesenclast     $rndkey0,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+        aesenclast     $rndkey0,$inout2
+       pxor    $twres,@tweak[5]
+        aesenclast     $rndkey0,$inout3
+        aesenclast     $rndkey0,$inout4
+        aesenclast     $rndkey0,$inout5
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[3]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        xorps  `16*0`(%rsp),$inout0            # output^=tweak
+       pand    $twmask,$twres                  # isolate carry and residue
+        xorps  `16*1`(%rsp),$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+       pxor    $twres,@tweak[5]
+
+       xorps   `16*2`(%rsp),$inout2
+       movups  $inout0,`16*0`($out)            # write output
+       xorps   `16*3`(%rsp),$inout3
+       movups  $inout1,`16*1`($out)
+       xorps   `16*4`(%rsp),$inout4
+       movups  $inout2,`16*2`($out)
+       xorps   `16*5`(%rsp),$inout5
+       movups  $inout3,`16*3`($out)
+       mov     $rnds_,$rounds                  # restore $rounds
+       movups  $inout4,`16*4`($out)
+       movups  $inout5,`16*5`($out)
+       lea     `16*6`($out),$out
+       sub     \$16*6,$len
+       jnc     .Lxts_enc_grandloop
+
+       lea     3($rounds,$rounds),$rounds      # restore original value
+       mov     $key_,$key                      # restore $key
+       mov     $rounds,$rnds_                  # backup $rounds
+
+.Lxts_enc_short:
+       add     \$16*6,$len
+       jz      .Lxts_enc_done
+
+       cmp     \$0x20,$len
+       jb      .Lxts_enc_one
+       je      .Lxts_enc_two
+
+       cmp     \$0x40,$len
+       jb      .Lxts_enc_three
+       je      .Lxts_enc_four
+
+       pshufd  \$0x13,$twtmp,$twres
+       movdqa  @tweak[5],@tweak[4]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        movdqu ($inp),$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        movdqu 16*1($inp),$inout1
+       pxor    $twres,@tweak[5]
+
+       movdqu  16*2($inp),$inout2
+       pxor    @tweak[0],$inout0
+       movdqu  16*3($inp),$inout3
+       pxor    @tweak[1],$inout1
+       movdqu  16*4($inp),$inout4
+       lea     16*5($inp),$inp
+       pxor    @tweak[2],$inout2
+       pxor    @tweak[3],$inout3
+       pxor    @tweak[4],$inout4
+
+       call    _aesni_encrypt6
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[5],@tweak[0]             # next tweak, for stealing
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movdqu  $inout0,($out)
+       xorps   @tweak[3],$inout3
+       movdqu  $inout1,16*1($out)
+       xorps   @tweak[4],$inout4
+       movdqu  $inout2,16*2($out)
+       movdqu  $inout3,16*3($out)
+       movdqu  $inout4,16*4($out)
+       lea     16*5($out),$out
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+       movups  ($inp),$inout0
+       lea     16*1($inp),$inp
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[1],@tweak[0]             # next tweak, for stealing
+       movups  $inout0,($out)
+       lea     16*1($out),$out
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+       movups  ($inp),$inout0
+       movups  16($inp),$inout1
+       lea     32($inp),$inp
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+
+       call    _aesni_encrypt3
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[2],@tweak[0]             # next tweak, for stealing
+       xorps   @tweak[1],$inout1
+       movups  $inout0,($out)
+       movups  $inout1,16*1($out)
+       lea     16*2($out),$out
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       lea     16*3($inp),$inp
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+
+       call    _aesni_encrypt3
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[3],@tweak[0]             # next tweak, for stealing
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movups  $inout0,($out)
+       movups  $inout1,16*1($out)
+       movups  $inout2,16*2($out)
+       lea     16*3($out),$out
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       xorps   @tweak[0],$inout0
+       movups  16*3($inp),$inout3
+       lea     16*4($inp),$inp
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       xorps   @tweak[3],$inout3
+
+       call    _aesni_encrypt4
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[5],@tweak[0]             # next tweak, for stealing
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movups  $inout0,($out)
+       xorps   @tweak[3],$inout3
+       movups  $inout1,16*1($out)
+       movups  $inout2,16*2($out)
+       movups  $inout3,16*3($out)
+       lea     16*4($out),$out
+       jmp     .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+       and     \$15,$len_                      # trailing partial-block bytes
+       jz      .Lxts_enc_ret
+       mov     $len_,$len
+
+.Lxts_enc_steal:
+       movzb   ($inp),%eax                     # borrow $rounds ...
+       movzb   -16($out),%ecx                  # ... and $key
+       lea     1($inp),$inp
+       mov     %al,-16($out)
+       mov     %cl,0($out)
+       lea     1($out),$out
+       sub     \$1,$len
+       jnz     .Lxts_enc_steal
+
+       sub     $len_,$out                      # rewind $out
+       mov     $key_,$key                      # restore $key
+       mov     $rnds_,$rounds                  # restore $rounds
+
+       movups  -16($out),$inout0
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movups  $inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+# Win64 only: restore xmm6-xmm15.
+$code.=<<___ if ($win64);
+       movaps  0x60(%rsp),%xmm6
+       movaps  0x70(%rsp),%xmm7
+       movaps  0x80(%rsp),%xmm8
+       movaps  0x90(%rsp),%xmm9
+       movaps  0xa0(%rsp),%xmm10
+       movaps  0xb0(%rsp),%xmm11
+       movaps  0xc0(%rsp),%xmm12
+       movaps  0xd0(%rsp),%xmm13
+       movaps  0xe0(%rsp),%xmm14
+       movaps  0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     $frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+       ret
+.size  aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type  aesni_xts_decrypt,address@hidden,6
+.align 16
+aesni_xts_decrypt:
+       lea     -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,0x60(%rsp)
+       movaps  %xmm7,0x70(%rsp)
+       movaps  %xmm8,0x80(%rsp)
+       movaps  %xmm9,0x90(%rsp)
+       movaps  %xmm10,0xa0(%rsp)
+       movaps  %xmm11,0xb0(%rsp)
+       movaps  %xmm12,0xc0(%rsp)
+       movaps  %xmm13,0xd0(%rsp)
+       movaps  %xmm14,0xe0(%rsp)
+       movaps  %xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+       movups  ($ivp),@tweak[5]                # load clear-text tweak
+       mov     240($key2),$rounds              # key2->rounds
+       mov     240($key),$rnds_                # key1->rounds
+___
+       # generate the tweak
+       &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+       xor     %eax,%eax                       # if ($len%16) len-=16;
+       test    \$15,$len
+       setnz   %al
+       shl     \$4,%rax
+       sub     %rax,$len
+
+       mov     $key,$key_                      # backup $key
+       mov     $rnds_,$rounds                  # backup $rounds
+       mov     $len,$len_                      # backup $len
+       and     \$-16,$len
+
+       movdqa  .Lxts_magic(%rip),$twmask
+       pxor    $twtmp,$twtmp
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+___
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[$i]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+       pand    $twmask,$twres                  # isolate carry and residue
+       pcmpgtd @tweak[5],$twtmp                # broadcat upper bits
+       pxor    $twres,@tweak[5]
+___
+    }
+$code.=<<___;
+       sub     \$16*6,$len
+       jc      .Lxts_dec_short
+
+       shr     \$1,$rounds
+       sub     \$1,$rounds
+       mov     $rounds,$rnds_
+       jmp     .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+       pshufd  \$0x13,$twtmp,$twres
+       movdqa  @tweak[5],@tweak[4]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+       movdqu  `16*0`($inp),$inout0            # load input
+       pand    $twmask,$twres                  # isolate carry and residue
+       movdqu  `16*1`($inp),$inout1
+       pxor    $twres,@tweak[5]
+
+       movdqu  `16*2`($inp),$inout2
+       pxor    @tweak[0],$inout0               # input^=tweak
+       movdqu  `16*3`($inp),$inout3
+       pxor    @tweak[1],$inout1
+       movdqu  `16*4`($inp),$inout4
+       pxor    @tweak[2],$inout2
+       movdqu  `16*5`($inp),$inout5
+       lea     `16*6`($inp),$inp
+       pxor    @tweak[3],$inout3
+       $movkey         ($key_),$rndkey0
+       pxor    @tweak[4],$inout4
+       pxor    @tweak[5],$inout5
+
+       # inline _aesni_decrypt6 and interleave first and last rounds
+       # with own code...
+       $movkey         16($key_),$rndkey1
+       pxor            $rndkey0,$inout0
+       pxor            $rndkey0,$inout1
+        movdqa @tweak[0],`16*0`(%rsp)          # put aside tweaks
+       aesdec          $rndkey1,$inout0
+       lea             32($key_),$key
+       pxor            $rndkey0,$inout2
+        movdqa @tweak[1],`16*1`(%rsp)
+       aesdec          $rndkey1,$inout1
+       pxor            $rndkey0,$inout3
+        movdqa @tweak[2],`16*2`(%rsp)
+       aesdec          $rndkey1,$inout2
+       pxor            $rndkey0,$inout4
+        movdqa @tweak[3],`16*3`(%rsp)
+       aesdec          $rndkey1,$inout3
+       pxor            $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       dec             $rounds
+        movdqa @tweak[4],`16*4`(%rsp)
+       aesdec          $rndkey1,$inout4
+        movdqa @tweak[5],`16*5`(%rsp)
+       aesdec          $rndkey1,$inout5
+       pxor    $twtmp,$twtmp
+       pcmpgtd @tweak[5],$twtmp
+       jmp             .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       dec             $rounds
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+       $movkey         16($key),$rndkey1
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       lea             32($key),$key
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         ($key),$rndkey0
+       jnz             .Lxts_dec_loop6
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesdec         $rndkey1,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesdec         $rndkey1,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcast upper bits
+        aesdec         $rndkey1,$inout2
+       pxor    $twres,@tweak[5]
+        aesdec         $rndkey1,$inout3
+        aesdec         $rndkey1,$inout4
+        aesdec         $rndkey1,$inout5
+        $movkey        16($key),$rndkey1
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[0]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesdec         $rndkey0,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesdec         $rndkey0,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcat upper bits
+        aesdec         $rndkey0,$inout2
+       pxor    $twres,@tweak[5]
+        aesdec         $rndkey0,$inout3
+        aesdec         $rndkey0,$inout4
+        aesdec         $rndkey0,$inout5
+        $movkey        32($key),$rndkey0
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[1]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesdec         $rndkey1,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesdec         $rndkey1,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcat upper bits
+        aesdec         $rndkey1,$inout2
+       pxor    $twres,@tweak[5]
+        aesdec         $rndkey1,$inout3
+        aesdec         $rndkey1,$inout4
+        aesdec         $rndkey1,$inout5
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[2]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        aesdeclast     $rndkey0,$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        aesdeclast     $rndkey0,$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcat upper bits
+        aesdeclast     $rndkey0,$inout2
+       pxor    $twres,@tweak[5]
+        aesdeclast     $rndkey0,$inout3
+        aesdeclast     $rndkey0,$inout4
+        aesdeclast     $rndkey0,$inout5
+
+       pshufd  \$0x13,$twtmp,$twres
+       pxor    $twtmp,$twtmp
+       movdqa  @tweak[5],@tweak[3]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        xorps  `16*0`(%rsp),$inout0            # output^=tweak
+       pand    $twmask,$twres                  # isolate carry and residue
+        xorps  `16*1`(%rsp),$inout1
+       pcmpgtd @tweak[5],$twtmp                # broadcat upper bits
+       pxor    $twres,@tweak[5]
+
+       xorps   `16*2`(%rsp),$inout2
+       movups  $inout0,`16*0`($out)            # write output
+       xorps   `16*3`(%rsp),$inout3
+       movups  $inout1,`16*1`($out)
+       xorps   `16*4`(%rsp),$inout4
+       movups  $inout2,`16*2`($out)
+       xorps   `16*5`(%rsp),$inout5
+       movups  $inout3,`16*3`($out)
+       mov     $rnds_,$rounds                  # restore $rounds
+       movups  $inout4,`16*4`($out)
+       movups  $inout5,`16*5`($out)
+       lea     `16*6`($out),$out
+       sub     \$16*6,$len
+       jnc     .Lxts_dec_grandloop
+
+       lea     3($rounds,$rounds),$rounds      # restore original value
+       mov     $key_,$key                      # restore $key
+       mov     $rounds,$rnds_                  # backup $rounds
+
+.Lxts_dec_short:
+       add     \$16*6,$len
+       jz      .Lxts_dec_done
+
+       cmp     \$0x20,$len
+       jb      .Lxts_dec_one
+       je      .Lxts_dec_two
+
+       cmp     \$0x40,$len
+       jb      .Lxts_dec_three
+       je      .Lxts_dec_four
+
+       pshufd  \$0x13,$twtmp,$twres
+       movdqa  @tweak[5],@tweak[4]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        movdqu ($inp),$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        movdqu 16*1($inp),$inout1
+       pxor    $twres,@tweak[5]
+
+       movdqu  16*2($inp),$inout2
+       pxor    @tweak[0],$inout0
+       movdqu  16*3($inp),$inout3
+       pxor    @tweak[1],$inout1
+       movdqu  16*4($inp),$inout4
+       lea     16*5($inp),$inp
+       pxor    @tweak[2],$inout2
+       pxor    @tweak[3],$inout3
+       pxor    @tweak[4],$inout4
+
+       call    _aesni_decrypt6
+
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       movdqu  $inout0,($out)
+       xorps   @tweak[3],$inout3
+       movdqu  $inout1,16*1($out)
+       xorps   @tweak[4],$inout4
+       movdqu  $inout2,16*2($out)
+        pxor           $twtmp,$twtmp
+       movdqu  $inout3,16*3($out)
+        pcmpgtd        @tweak[5],$twtmp
+       movdqu  $inout4,16*4($out)
+       lea     16*5($out),$out
+        pshufd         \$0x13,$twtmp,@tweak[1] # $twres
+       and     \$15,$len_
+       jz      .Lxts_dec_ret
+
+       movdqa  @tweak[5],@tweak[0]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+       pand    $twmask,@tweak[1]               # isolate carry and residue
+       pxor    @tweak[5],@tweak[1]
+       jmp     .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+       movups  ($inp),$inout0
+       lea     16*1($inp),$inp
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[1],@tweak[0]
+       movups  $inout0,($out)
+       movdqa  @tweak[2],@tweak[1]
+       lea     16*1($out),$out
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+       movups  ($inp),$inout0
+       movups  16($inp),$inout1
+       lea     32($inp),$inp
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+
+       call    _aesni_decrypt3
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[2],@tweak[0]
+       xorps   @tweak[1],$inout1
+       movdqa  @tweak[3],@tweak[1]
+       movups  $inout0,($out)
+       movups  $inout1,16*1($out)
+       lea     16*2($out),$out
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+       movups  ($inp),$inout0
+       movups  16*1($inp),$inout1
+       movups  16*2($inp),$inout2
+       lea     16*3($inp),$inp
+       xorps   @tweak[0],$inout0
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+
+       call    _aesni_decrypt3
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[3],@tweak[0]
+       xorps   @tweak[1],$inout1
+       movdqa  @tweak[5],@tweak[1]
+       xorps   @tweak[2],$inout2
+       movups  $inout0,($out)
+       movups  $inout1,16*1($out)
+       movups  $inout2,16*2($out)
+       lea     16*3($out),$out
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+       pshufd  \$0x13,$twtmp,$twres
+       movdqa  @tweak[5],@tweak[4]
+       paddq   @tweak[5],@tweak[5]             # psllq 1,$tweak
+        movups ($inp),$inout0
+       pand    $twmask,$twres                  # isolate carry and residue
+        movups 16*1($inp),$inout1
+       pxor    $twres,@tweak[5]
+
+       movups  16*2($inp),$inout2
+       xorps   @tweak[0],$inout0
+       movups  16*3($inp),$inout3
+       lea     16*4($inp),$inp
+       xorps   @tweak[1],$inout1
+       xorps   @tweak[2],$inout2
+       xorps   @tweak[3],$inout3
+
+       call    _aesni_decrypt4
+
+       xorps   @tweak[0],$inout0
+       movdqa  @tweak[4],@tweak[0]
+       xorps   @tweak[1],$inout1
+       movdqa  @tweak[5],@tweak[1]
+       xorps   @tweak[2],$inout2
+       movups  $inout0,($out)
+       xorps   @tweak[3],$inout3
+       movups  $inout1,16*1($out)
+       movups  $inout2,16*2($out)
+       movups  $inout3,16*3($out)
+       lea     16*4($out),$out
+       jmp     .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+       and     \$15,$len_
+       jz      .Lxts_dec_ret
+.Lxts_dec_done2:
+       mov     $len_,$len
+       mov     $key_,$key                      # restore $key
+       mov     $rnds_,$rounds                  # restore $rounds
+
+       movups  ($inp),$inout0
+       xorps   @tweak[1],$inout0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[1],$inout0
+       movups  $inout0,($out)
+
+.Lxts_dec_steal:
+       movzb   16($inp),%eax                   # borrow $rounds ...
+       movzb   ($out),%ecx                     # ... and $key
+       lea     1($inp),$inp
+       mov     %al,($out)
+       mov     %cl,16($out)
+       lea     1($out),$out
+       sub     \$1,$len
+       jnz     .Lxts_dec_steal
+
+       sub     $len_,$out                      # rewind $out
+       mov     $key_,$key                      # restore $key
+       mov     $rnds_,$rounds                  # restore $rounds
+
+       movups  ($out),$inout0
+       xorps   @tweak[0],$inout0
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   @tweak[0],$inout0
+       movups  $inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+       movaps  0x60(%rsp),%xmm6
+       movaps  0x70(%rsp),%xmm7
+       movaps  0x80(%rsp),%xmm8
+       movaps  0x90(%rsp),%xmm9
+       movaps  0xa0(%rsp),%xmm10
+       movaps  0xb0(%rsp),%xmm11
+       movaps  0xc0(%rsp),%xmm12
+       movaps  0xd0(%rsp),%xmm13
+       movaps  0xe0(%rsp),%xmm14
+       movaps  0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     $frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
+       ret
+.size  aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+#                          size_t length, const AES_KEY *key,
+#                          unsigned char *ivp,const int enc);
+#
+# CBC-mode entry point.  A non-zero 6th argument (tested in %r9d) selects
+# the encrypt path, zero selects the decrypt path.  Both paths read the
+# IV from *ivp and store the updated chaining value back through it; a
+# zero length returns immediately without touching anything.
+#
+# Encrypt: one block per iteration; a trailing partial block is copied to
+# the output buffer, zero-padded to 16 bytes and encrypted in place.
+# Decrypt: eight blocks per iteration while more than 0x70 bytes remain,
+# then a tail dispatch that handles one to seven blocks.
+{
+# Offset from %rsp of a 16-byte scratch slot used by the decrypt path:
+# just above the four saved %xmm registers on Win64, below %rsp otherwise.
+my $reserved = $win64?0x40:-0x18;      # used in decrypt
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type  ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+       test    $len,$len               # check length
+       jz      .Lcbc_ret
+
+       mov     240($key),$rnds_        # key->rounds
+       mov     $key,$key_              # backup $key
+       test    %r9d,%r9d               # 6th argument
+       jz      .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+       movups  ($ivp),$inout0          # load iv as initial state
+       mov     $rnds_,$rounds
+       cmp     \$16,$len
+       jb      .Lcbc_enc_tail
+       sub     \$16,$len
+       jmp     .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+       movups  ($inp),$inout1          # load input
+       lea     16($inp),$inp
+       #xorps  $inout1,$inout0
+___
+       # $inout1 (the freshly loaded input block) is passed as an extra
+       # operand; presumably aesni_generate1 folds the xor that is commented
+       # out above into its first round -- see aesni_generate1 to confirm.
+       &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+       mov     $rnds_,$rounds          # restore $rounds
+       mov     $key_,$key              # restore $key
+       movups  $inout0,0($out)         # store output
+       lea     16($out),$out
+       sub     \$16,$len
+       jnc     .Lcbc_enc_loop
+       add     \$16,$len               # undo the borrow: bytes still pending
+       jnz     .Lcbc_enc_tail
+       movups  $inout0,($ivp)          # last ciphertext block is the new iv
+       jmp     .Lcbc_ret
+
+.Lcbc_enc_tail:
+       mov     $len,%rcx       # zaps $key
+       xchg    $inp,$out       # $inp is %rsi and $out is %rdi now
+       .long   0x9066A4F3      # rep movsb
+       mov     \$16,%ecx       # zero tail
+       sub     $len,%rcx
+       xor     %eax,%eax
+       .long   0x9066AAF3      # rep stosb
+       lea     -16(%rdi),%rdi  # rewind $out by 1 block
+       mov     $rnds_,$rounds  # restore $rounds
+       mov     %rdi,%rsi       # $inp and $out are the same
+       mov     $key_,$key      # restore $key
+       xor     $len,$len       # len=0: the loop exits after this block
+       jmp     .Lcbc_enc_loop  # one more spin
+#--------------------------- CBC DECRYPT ------------------------------#
+.align 16
+.Lcbc_decrypt:
+___
+# Win64 only: make room for and save %xmm6-%xmm9 before they are used.
+$code.=<<___ if ($win64);
+       lea     -0x58(%rsp),%rsp
+       movaps  %xmm6,(%rsp)
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+       movups  ($ivp),$iv
+       mov     $rnds_,$rounds
+       cmp     \$0x70,$len
+       jbe     .Lcbc_dec_tail
+       shr     \$1,$rnds_              # 8x loop counts rounds in pairs
+       sub     \$0x70,$len
+       mov     $rnds_,$rounds
+       movaps  $iv,$reserved(%rsp)
+       jmp     .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+       movaps  $rndkey0,$reserved(%rsp)        # save IV
+       movups  $inout7,($out)                  # flush 8th block of previous pass
+       lea     0x10($out),$out
+.Lcbc_dec_loop8_enter:
+       $movkey         ($key),$rndkey0
+       movups  ($inp),$inout0                  # load input
+       movups  0x10($inp),$inout1
+       $movkey         16($key),$rndkey1
+
+       lea             32($key),$key
+       movdqu  0x20($inp),$inout2
+       xorps           $rndkey0,$inout0
+       movdqu  0x30($inp),$inout3
+       xorps           $rndkey0,$inout1
+       movdqu  0x40($inp),$inout4
+       aesdec          $rndkey1,$inout0
+       pxor            $rndkey0,$inout2
+       movdqu  0x50($inp),$inout5
+       aesdec          $rndkey1,$inout1
+       pxor            $rndkey0,$inout3
+       movdqu  0x60($inp),$inout6
+       aesdec          $rndkey1,$inout2
+       pxor            $rndkey0,$inout4
+       movdqu  0x70($inp),$inout7
+       aesdec          $rndkey1,$inout3
+       pxor            $rndkey0,$inout5
+       dec             $rounds
+       aesdec          $rndkey1,$inout4
+       pxor            $rndkey0,$inout6
+       aesdec          $rndkey1,$inout5
+       pxor            $rndkey0,$inout7
+       $movkey         ($key),$rndkey0
+       aesdec          $rndkey1,$inout6
+       aesdec          $rndkey1,$inout7
+       $movkey         16($key),$rndkey1
+
+       call            .Ldec_loop8_enter       # remaining rounds (defined elsewhere)
+
+       movups  ($inp),$rndkey1         # re-load input
+       movups  0x10($inp),$rndkey0
+       xorps   $reserved(%rsp),$inout0 # ^= IV
+       xorps   $rndkey1,$inout1
+       movups  0x20($inp),$rndkey1
+       xorps   $rndkey0,$inout2
+       movups  0x30($inp),$rndkey0
+       xorps   $rndkey1,$inout3
+       movups  0x40($inp),$rndkey1
+       xorps   $rndkey0,$inout4
+       movups  0x50($inp),$rndkey0
+       xorps   $rndkey1,$inout5
+       movups  0x60($inp),$rndkey1
+       xorps   $rndkey0,$inout6
+       movups  0x70($inp),$rndkey0     # IV
+       xorps   $rndkey1,$inout7
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       mov     $rnds_,$rounds          # restore $rounds
+       movups  $inout4,0x40($out)
+       mov     $key_,$key              # restore $key
+       movups  $inout5,0x50($out)
+       lea     0x80($inp),$inp
+       movups  $inout6,0x60($out)
+       lea     0x70($out),$out         # 8th block is still held in a register
+       sub     \$0x80,$len
+       ja      .Lcbc_dec_loop8
+
+       movaps  $inout7,$inout0         # pending 8th block becomes "last block"
+       movaps  $rndkey0,$iv
+       add     \$0x70,$len
+       jle     .Lcbc_dec_tail_collected
+       movups  $inout0,($out)
+       lea     1($rnds_,$rnds_),$rounds        # undo the halving done before the loop
+       lea     0x10($out),$out
+.Lcbc_dec_tail:
+       movups  ($inp),$inout0
+       movaps  $inout0,$in0            # keep ciphertext for chaining
+       cmp     \$0x10,$len
+       jbe     .Lcbc_dec_one
+
+       movups  0x10($inp),$inout1
+       movaps  $inout1,$in1
+       cmp     \$0x20,$len
+       jbe     .Lcbc_dec_two
+
+       movups  0x20($inp),$inout2
+       movaps  $inout2,$in2
+       cmp     \$0x30,$len
+       jbe     .Lcbc_dec_three
+
+       movups  0x30($inp),$inout3
+       cmp     \$0x40,$len
+       jbe     .Lcbc_dec_four
+
+       movups  0x40($inp),$inout4
+       cmp     \$0x50,$len
+       jbe     .Lcbc_dec_five
+
+       movups  0x50($inp),$inout5
+       cmp     \$0x60,$len
+       jbe     .Lcbc_dec_six
+
+       movups  0x60($inp),$inout6
+       movaps  $iv,$reserved(%rsp)     # save IV
+       call    _aesni_decrypt8
+       movups  ($inp),$rndkey1
+       movups  0x10($inp),$rndkey0
+       xorps   $reserved(%rsp),$inout0 # ^= IV
+       xorps   $rndkey1,$inout1
+       movups  0x20($inp),$rndkey1
+       xorps   $rndkey0,$inout2
+       movups  0x30($inp),$rndkey0
+       xorps   $rndkey1,$inout3
+       movups  0x40($inp),$rndkey1
+       xorps   $rndkey0,$inout4
+       movups  0x50($inp),$rndkey0
+       xorps   $rndkey1,$inout5
+       movups  0x60($inp),$iv          # IV
+       xorps   $rndkey0,$inout6
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       movups  $inout5,0x50($out)
+       lea     0x60($out),$out
+       movaps  $inout6,$inout0
+       sub     \$0x70,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+___
+       &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+       xorps   $iv,$inout0
+       movaps  $in0,$iv
+       sub     \$0x10,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+       xorps   $inout2,$inout2         # clear the lane that carries no input
+       call    _aesni_decrypt3
+       xorps   $iv,$inout0
+       xorps   $in0,$inout1
+       movups  $inout0,($out)
+       movaps  $in1,$iv
+       movaps  $inout1,$inout0
+       lea     0x10($out),$out
+       sub     \$0x20,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+       call    _aesni_decrypt3
+       xorps   $iv,$inout0
+       xorps   $in0,$inout1
+       movups  $inout0,($out)
+       xorps   $in1,$inout2
+       movups  $inout1,0x10($out)
+       movaps  $in2,$iv
+       movaps  $inout2,$inout0
+       lea     0x20($out),$out
+       sub     \$0x30,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+       call    _aesni_decrypt4
+       xorps   $iv,$inout0
+       movups  0x30($inp),$iv
+       xorps   $in0,$inout1
+       movups  $inout0,($out)
+       xorps   $in1,$inout2
+       movups  $inout1,0x10($out)
+       xorps   $in2,$inout3
+       movups  $inout2,0x20($out)
+       movaps  $inout3,$inout0
+       lea     0x30($out),$out
+       sub     \$0x40,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+       xorps   $inout5,$inout5         # clear the lane that carries no input
+       call    _aesni_decrypt6
+       movups  0x10($inp),$rndkey1
+       movups  0x20($inp),$rndkey0
+       xorps   $iv,$inout0
+       xorps   $in0,$inout1
+       xorps   $rndkey1,$inout2
+       movups  0x30($inp),$rndkey1
+       xorps   $rndkey0,$inout3
+       movups  0x40($inp),$iv
+       xorps   $rndkey1,$inout4
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       lea     0x40($out),$out
+       movaps  $inout4,$inout0
+       sub     \$0x50,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+       call    _aesni_decrypt6
+       movups  0x10($inp),$rndkey1
+       movups  0x20($inp),$rndkey0
+       xorps   $iv,$inout0
+       xorps   $in0,$inout1
+       xorps   $rndkey1,$inout2
+       movups  0x30($inp),$rndkey1
+       xorps   $rndkey0,$inout3
+       movups  0x40($inp),$rndkey0
+       xorps   $rndkey1,$inout4
+       movups  0x50($inp),$iv
+       xorps   $rndkey0,$inout5
+       movups  $inout0,($out)
+       movups  $inout1,0x10($out)
+       movups  $inout2,0x20($out)
+       movups  $inout3,0x30($out)
+       movups  $inout4,0x40($out)
+       lea     0x50($out),$out
+       movaps  $inout5,$inout0
+       sub     \$0x60,$len
+       jmp     .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+       and     \$15,$len               # non-zero only for a partial last block
+       movups  $iv,($ivp)
+       jnz     .Lcbc_dec_tail_partial
+       movups  $inout0,($out)
+       jmp     .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+       # NOTE(review): len is (length mod 16) at this point, so the copy
+       # below moves 16-len bytes rather than len bytes; confirm intended.
+       movaps  $inout0,$reserved(%rsp)
+       mov     \$16,%rcx
+       mov     $out,%rdi
+       sub     $len,%rcx
+       lea     $reserved(%rsp),%rsi
+       .long   0x9066A4F3      # rep movsb
+
+.Lcbc_dec_ret:
+___
+# Win64 only: restore %xmm6-%xmm9 and release the save area.
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       lea     0x58(%rsp),%rsp
+___
+$code.=<<___;
+.Lcbc_ret:
+       ret
+.size  ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+} 
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+#                              int bits, AES_KEY *key)
+#
+# Return value (in %eax/%rax): 0 on success, -1 if userKey or key is
+# NULL, -2 if bits is not 128, 192 or 256.  On success the round keys are
+# written starting at *key and the value stored at offset 240 is
+# rounds-1 (9, 11 or 13).
+#
+# set_decrypt_key builds the encryption schedule first, then reverses
+# the order of the round keys and applies aesimc to every key except the
+# first and the last.
+{ my ($inp,$bits,$key) = @_4args;
+  $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type  ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+       .byte   0x48,0x83,0xEC,0x08     # sub rsp,8
+       call    __aesni_set_encrypt_key
+       shl     \$4,$bits               # rounds-1 after _aesni_set_encrypt_key
+       test    %eax,%eax
+       jnz     .Ldec_key_ret           # propagate the error code unchanged
+       lea     16($key,$bits),$inp     # points at the end of key schedule
+
+       $movkey ($key),%xmm0            # just swap
+       $movkey ($inp),%xmm1
+       $movkey %xmm0,($inp)
+       $movkey %xmm1,($key)
+       lea     16($key),$key
+       lea     -16($inp),$inp
+
+.Ldec_key_inverse:
+       $movkey ($key),%xmm0            # swap and inverse
+       $movkey ($inp),%xmm1
+       aesimc  %xmm0,%xmm0
+       aesimc  %xmm1,%xmm1
+       lea     16($key),$key
+       lea     -16($inp),$inp
+       $movkey %xmm0,16($inp)
+       $movkey %xmm1,-16($key)
+       cmp     $key,$inp
+       ja      .Ldec_key_inverse       # until the two pointers meet
+
+       $movkey ($key),%xmm0            # inverse middle
+       aesimc  %xmm0,%xmm0
+       $movkey %xmm0,($inp)
+.Ldec_key_ret:
+       add     \$8,%rsp
+       ret
+.LSEH_end_set_decrypt_key:
+.size  ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on submission by
+#
+#      Huang Ying <address@hidden>
+#      Vinodh Gopal <address@hidden>
+#      Kahraman Akdemir
+#
+# Aggressively optimized with respect to aeskeygenassist's critical path
+# and is contained in %xmm0-5 to meet Win64 ABI requirement.
+#
+# %rax is used as the write pointer into the key schedule while it is
+# being generated and only becomes the return value at the very end.
+# The .Lkey_expansion_* helpers store the current round key at (%rax),
+# advance %rax, then derive the next key; the *_cold entry points skip
+# the store for the first call of each key size.
+#
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type  ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+       .byte   0x48,0x83,0xEC,0x08     # sub rsp,8
+       mov     \$-1,%rax               # error code for NULL arguments
+       test    $inp,$inp
+       jz      .Lenc_key_ret
+       test    $key,$key
+       jz      .Lenc_key_ret
+
+       movups  ($inp),%xmm0            # pull first 128 bits of *userKey
+       xorps   %xmm4,%xmm4             # low dword of xmm4 is assumed 0
+       lea     16($key),%rax
+       cmp     \$256,$bits
+       je      .L14rounds
+       cmp     \$192,$bits
+       je      .L12rounds
+       cmp     \$128,$bits
+       jne     .Lbad_keybits
+
+.L10rounds:
+       mov     \$9,$bits                       # 10 rounds for 128-bit key
+       $movkey %xmm0,($key)                    # round 0
+       aeskeygenassist \$0x1,%xmm0,%xmm1       # round 1
+       call            .Lkey_expansion_128_cold
+       aeskeygenassist \$0x2,%xmm0,%xmm1       # round 2
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x4,%xmm0,%xmm1       # round 3
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x8,%xmm0,%xmm1       # round 4
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x10,%xmm0,%xmm1      # round 5
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x20,%xmm0,%xmm1      # round 6
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x40,%xmm0,%xmm1      # round 7
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x80,%xmm0,%xmm1      # round 8
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x1b,%xmm0,%xmm1      # round 9
+       call            .Lkey_expansion_128
+       aeskeygenassist \$0x36,%xmm0,%xmm1      # round 10
+       call            .Lkey_expansion_128
+       $movkey %xmm0,(%rax)
+       mov     $bits,80(%rax)  # 240(%rdx)
+       xor     %eax,%eax       # return 0
+       jmp     .Lenc_key_ret
+
+.align 16
+.L12rounds:
+       movq    16($inp),%xmm2                  # remaining 1/3 of *userKey
+       mov     \$11,$bits                      # 12 rounds for 192
+       $movkey %xmm0,($key)                    # round 0
+       aeskeygenassist \$0x1,%xmm2,%xmm1       # round 1,2
+       call            .Lkey_expansion_192a_cold
+       aeskeygenassist \$0x2,%xmm2,%xmm1       # round 2,3
+       call            .Lkey_expansion_192b
+       aeskeygenassist \$0x4,%xmm2,%xmm1       # round 4,5
+       call            .Lkey_expansion_192a
+       aeskeygenassist \$0x8,%xmm2,%xmm1       # round 5,6
+       call            .Lkey_expansion_192b
+       aeskeygenassist \$0x10,%xmm2,%xmm1      # round 7,8
+       call            .Lkey_expansion_192a
+       aeskeygenassist \$0x20,%xmm2,%xmm1      # round 8,9
+       call            .Lkey_expansion_192b
+       aeskeygenassist \$0x40,%xmm2,%xmm1      # round 10,11
+       call            .Lkey_expansion_192a
+       aeskeygenassist \$0x80,%xmm2,%xmm1      # round 11,12
+       call            .Lkey_expansion_192b
+       $movkey %xmm0,(%rax)
+       mov     $bits,48(%rax)  # 240(%rdx)
+       xor     %eax,%eax       # return 0
+       jmp     .Lenc_key_ret
+
+.align 16
+.L14rounds:
+       movups  16($inp),%xmm2                  # remaining half of *userKey
+       mov     \$13,$bits                      # 14 rounds for 256
+       lea     16(%rax),%rax
+       $movkey %xmm0,($key)                    # round 0
+       $movkey %xmm2,16($key)                  # round 1
+       aeskeygenassist \$0x1,%xmm2,%xmm1       # round 2
+       call            .Lkey_expansion_256a_cold
+       aeskeygenassist \$0x1,%xmm0,%xmm1       # round 3
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x2,%xmm2,%xmm1       # round 4
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x2,%xmm0,%xmm1       # round 5
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x4,%xmm2,%xmm1       # round 6
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x4,%xmm0,%xmm1       # round 7
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x8,%xmm2,%xmm1       # round 8
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x8,%xmm0,%xmm1       # round 9
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x10,%xmm2,%xmm1      # round 10
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x10,%xmm0,%xmm1      # round 11
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x20,%xmm2,%xmm1      # round 12
+       call            .Lkey_expansion_256a
+       aeskeygenassist \$0x20,%xmm0,%xmm1      # round 13
+       call            .Lkey_expansion_256b
+       aeskeygenassist \$0x40,%xmm2,%xmm1      # round 14
+       call            .Lkey_expansion_256a
+       $movkey %xmm0,(%rax)
+       mov     $bits,16(%rax)  # 240(%rdx)
+       xor     %eax,%eax       # return 0
+       jmp     .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+       mov     \$-2,%rax               # unsupported key length
+.Lenc_key_ret:
+       add     \$8,%rsp
+       ret
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+       $movkey %xmm0,(%rax)            # store current round key, advance
+       lea     16(%rax),%rax
+.Lkey_expansion_128_cold:
+       shufps  \$0b00010000,%xmm0,%xmm4
+       xorps   %xmm4, %xmm0
+       shufps  \$0b10001100,%xmm0,%xmm4
+       xorps   %xmm4, %xmm0
+       shufps  \$0b11111111,%xmm1,%xmm1        # critical path
+       xorps   %xmm1,%xmm0
+       ret
+
+.align 16
+.Lkey_expansion_192a:
+       $movkey %xmm0,(%rax)            # store current round key, advance
+       lea     16(%rax),%rax
+.Lkey_expansion_192a_cold:
+       movaps  %xmm2, %xmm5            # keep a copy for the next 192b call
+.Lkey_expansion_192b_warm:
+       shufps  \$0b00010000,%xmm0,%xmm4
+       movdqa  %xmm2,%xmm3
+       xorps   %xmm4,%xmm0
+       shufps  \$0b10001100,%xmm0,%xmm4
+       pslldq  \$4,%xmm3
+       xorps   %xmm4,%xmm0
+       pshufd  \$0b01010101,%xmm1,%xmm1        # critical path
+       pxor    %xmm3,%xmm2
+       pxor    %xmm1,%xmm0
+       pshufd  \$0b11111111,%xmm0,%xmm3
+       pxor    %xmm3,%xmm2
+       ret
+
+.align 16
+.Lkey_expansion_192b:
+       movaps  %xmm0,%xmm3
+       shufps  \$0b01000100,%xmm0,%xmm5
+       $movkey %xmm5,(%rax)            # two round keys are emitted per call
+       shufps  \$0b01001110,%xmm2,%xmm3
+       $movkey %xmm3,16(%rax)
+       lea     32(%rax),%rax
+       jmp     .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+       $movkey %xmm2,(%rax)            # store current round key, advance
+       lea     16(%rax),%rax
+.Lkey_expansion_256a_cold:
+       shufps  \$0b00010000,%xmm0,%xmm4
+       xorps   %xmm4,%xmm0
+       shufps  \$0b10001100,%xmm0,%xmm4
+       xorps   %xmm4,%xmm0
+       shufps  \$0b11111111,%xmm1,%xmm1        # critical path
+       xorps   %xmm1,%xmm0
+       ret
+
+.align 16
+.Lkey_expansion_256b:
+       $movkey %xmm0,(%rax)            # store current round key, advance
+       lea     16(%rax),%rax
+
+       shufps  \$0b00010000,%xmm2,%xmm4
+       xorps   %xmm4,%xmm2
+       shufps  \$0b10001100,%xmm2,%xmm4
+       xorps   %xmm4,%xmm2
+       shufps  \$0b10101010,%xmm1,%xmm1        # critical path
+       xorps   %xmm1,%xmm2
+       ret
+.size  ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size  __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+# Read-only constants, aligned to 64 bytes.
+#   .Lbswap_mask   byte indices 15..0, i.e. a full 16-byte reversal pattern
+#                  (presumably a pshufb mask; users are outside this view)
+#   .Lincrement32  dwords 6,6,6,0
+#   .Lincrement64  dwords 1,0,0,0
+#   .Lxts_magic    dwords 0x87,0,1,0 (presumably the XTS carry/residue mask
+#                  loaded into the tweak mask register; confirm at its users)
+# NOTE(review): the e-mail address in the .asciz string reads
+# "address@hidden", which looks like mail-archive redaction; an unescaped
+# at-sign inside this heredoc is interpolated by Perl, so restore the
+# upstream string before using this file.
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+       .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+       .long   6,6,6,0
+.Lincrement64:
+       .long   1,0,0,0
+.Lxts_magic:
+       .long   0x87,0,1,0
+
+.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <address@hidden>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+# Win64 exception handler for the ECB routine: saves the non-volatile
+# registers and flags, reserves 64 bytes of stack, loads context->Rsp
+# into rax and continues in the shared .Lcommon_seh_tail.
+.type  ecb_se_handler,\@abi-omnipotent
+.align 16
+ecb_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       jmp     .Lcommon_seh_tail
+.size  ecb_se_handler,.-ecb_se_handler
+
+# Win64 exception handler for the CCM64 routines.  HandlerData[0] and
+# HandlerData[1] are image-relative offsets of the prologue and epilogue
+# labels.  If the faulting Rip lies between them, the four saved xmm
+# registers are copied from the stack frame into the context record and
+# the 0x58-byte frame is popped from the recovered stack pointer.
+.type  ccm64_se_handler,\@abi-omnipotent
+.align 16
+ccm64_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       lea     0(%rax),%rsi            # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$8,%ecx                # 4*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0x58(%rax),%rax         # adjust stack pointer
+
+       jmp     .Lcommon_seh_tail
+.size  ccm64_se_handler,.-ccm64_se_handler
+
+# Win64 exception handler for the CTR32 routine.  The body and return
+# labels are referenced directly instead of through HandlerData.  If the
+# faulting Rip lies between them, the ten saved xmm registers (at offset
+# 0x20 of the frame) are copied into the context record and the
+# 0xc8-byte frame is popped from the recovered stack pointer.
+.type  ctr32_se_handler,\@abi-omnipotent
+.align 16
+ctr32_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lctr32_body(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<"prologue" label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     .Lctr32_ret(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>="epilogue" label
+       jae     .Lcommon_seh_tail
+
+       lea     0x20(%rax),%rsi         # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0xc8(%rax),%rax         # adjust stack pointer
+
+       jmp     .Lcommon_seh_tail
+.size  ctr32_se_handler,.-ctr32_se_handler
+
+# Win64 exception handler for the XTS routines.  HandlerData[0] and
+# HandlerData[1] are image-relative offsets of the prologue and epilogue
+# labels.  If the faulting Rip lies between them, the ten saved xmm
+# registers (at offset 0x60 of the frame) are copied into the context
+# record and the 0x68+160-byte frame is popped from the recovered stack
+# pointer.
+.type  xts_se_handler,\@abi-omnipotent
+.align 16
+xts_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       lea     0x60(%rax),%rsi         # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0x68+160(%rax),%rax     # adjust stack pointer
+
+       jmp     .Lcommon_seh_tail
+.size  xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+# cbc_se_handler -- Win64 exception handler named by .LSEH_info_cbc.  It also
+# contains .Lcommon_seh_tail, the exit path every *_se_handler jumps to.
+#
+# Unlike the other handlers it starts with context->Rsp in %rax:
+#   Rip <  .Lcbc_decrypt       -> tail, %rax = context->Rsp
+#   Rip <  .Lcbc_decrypt_body  -> .Lrestore_cbc_rax reloads context->Rax
+#   Rip >= .Lcbc_ret           -> tail, %rax = context->Rsp
+#   otherwise                  -> 8 quadwords from the top of the stack are
+#                                 copied back to context.Xmm6 onwards and the
+#                                 stack pointer is advanced by 0x58
+.type  cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     152($context),%rax      # pull context->Rsp
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lcbc_decrypt(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<"prologue" label
+       jb      .Lcommon_seh_tail
+
+       lea     .Lcbc_decrypt_body(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<cbc_decrypt_body
+       jb      .Lrestore_cbc_rax
+
+       lea     .Lcbc_ret(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>="epilogue" label
+       jae     .Lcommon_seh_tail
+
+       lea     0(%rax),%rsi            # top of stack
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$8,%ecx                # 4*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0x58(%rax),%rax         # adjust stack pointer
+       jmp     .Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+       mov     120($context),%rax
+
+# Shared tail.  On entry %rax holds the value to store as context->Rsp;
+# the quadwords at 8(%rax) and 16(%rax) are stored as context->Rdi and
+# context->Rsi.  The updated CONTEXT is then copied to disp->ContextRecord,
+# RtlVirtualUnwind is called, the registers pushed on handler entry are
+# popped again and 1 (ExceptionContinueSearch) is returned.
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  cbc_se_handler,.-cbc_se_handler
+
+# Win64 function table.  Every group of three .rva values below gives the
+# begin label, the end label and the .xdata unwind-info label of one routine.
+.section       .pdata
+.align 4
+___
+# Entries that are emitted only when the generated prefix is "aesni".
+$code.=<<___ if ($PREFIX eq "aesni");
+       .rva    .LSEH_begin_aesni_ecb_encrypt
+       .rva    .LSEH_end_aesni_ecb_encrypt
+       .rva    .LSEH_info_ecb
+
+       .rva    .LSEH_begin_aesni_ccm64_encrypt_blocks
+       .rva    .LSEH_end_aesni_ccm64_encrypt_blocks
+       .rva    .LSEH_info_ccm64_enc
+
+       .rva    .LSEH_begin_aesni_ccm64_decrypt_blocks
+       .rva    .LSEH_end_aesni_ccm64_decrypt_blocks
+       .rva    .LSEH_info_ccm64_dec
+
+       .rva    .LSEH_begin_aesni_ctr32_encrypt_blocks
+       .rva    .LSEH_end_aesni_ctr32_encrypt_blocks
+       .rva    .LSEH_info_ctr32
+
+       .rva    .LSEH_begin_aesni_xts_encrypt
+       .rva    .LSEH_end_aesni_xts_encrypt
+       .rva    .LSEH_info_xts_enc
+
+       .rva    .LSEH_begin_aesni_xts_decrypt
+       .rva    .LSEH_end_aesni_xts_decrypt
+       .rva    .LSEH_info_xts_dec
+___
+# Entries emitted for every prefix: CBC plus the two key-setup routines,
+# which share one unwind-info record (.LSEH_info_key).
+$code.=<<___;
+       .rva    .LSEH_begin_${PREFIX}_cbc_encrypt
+       .rva    .LSEH_end_${PREFIX}_cbc_encrypt
+       .rva    .LSEH_info_cbc
+
+       .rva    ${PREFIX}_set_decrypt_key
+       .rva    .LSEH_end_set_decrypt_key
+       .rva    .LSEH_info_key
+
+       .rva    ${PREFIX}_set_encrypt_key
+       .rva    .LSEH_end_set_encrypt_key
+       .rva    .LSEH_info_key
+.section       .xdata
+.align 8
+___
+# Unwind-info records.  In the handler-based records the leading byte 9 is
+# version 1 with the exception-handler flag set; the remaining header bytes
+# (prologue size, unwind-code count, frame register) are zero.  The header
+# is followed by the handler RVA and, where present, the HandlerData[] words
+# that the handler reads back through disp->HandlerData.
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+       .byte   9,0,0,0
+       .rva    ecb_se_handler
+.LSEH_info_ccm64_enc:
+       .byte   9,0,0,0
+       .rva    ccm64_se_handler
+       .rva    .Lccm64_enc_body,.Lccm64_enc_ret        # HandlerData[]
+.LSEH_info_ccm64_dec:
+       .byte   9,0,0,0
+       .rva    ccm64_se_handler
+       .rva    .Lccm64_dec_body,.Lccm64_dec_ret        # HandlerData[]
+.LSEH_info_ctr32:
+       .byte   9,0,0,0
+       .rva    ctr32_se_handler
+.LSEH_info_xts_enc:
+       .byte   9,0,0,0
+       .rva    xts_se_handler
+       .rva    .Lxts_enc_body,.Lxts_enc_epilogue       # HandlerData[]
+.LSEH_info_xts_dec:
+       .byte   9,0,0,0
+       .rva    xts_se_handler
+       .rva    .Lxts_dec_body,.Lxts_dec_epilogue       # HandlerData[]
+___
+# .LSEH_info_key has no handler: version 1, no flags, a 4-byte prologue and
+# a single unwind code describing the 8-byte stack allocation.
+$code.=<<___;
+.LSEH_info_cbc:
+       .byte   9,0,0,0
+       .rva    cbc_se_handler
+.LSEH_info_key:
+       .byte   0x01,0x04,0x01,0x00
+       .byte   0x04,0x02,0x00,0x00     # sub rsp,8
+___
+}
+
+# rex(\@opcode, $dst, $src)
+# Appends a REX prefix byte to the referenced opcode array when either
+# register number is 8 or higher: bit 0x04 (REX.R) is set for $dst, bit 0x01
+# (REX.B) for $src, and the result is OR-ed with the 0x40 REX base.  Nothing
+# is appended when both register numbers are below 8.
+sub rex {
+  local *opcode=shift;         # alias @opcode to the caller's array
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04                 if($dst>=8);
+    $rex|=0x01                 if($src>=8);
+    push @opcode,$rex|0x40     if($rex);
+}
+
+# aesni($line)
+# Translates one AES-NI instruction with register operands into a ".byte"
+# directive so the output assembles even without AES-NI support in the
+# assembler.  Handles aeskeygenassist (immediate + two %xmm registers) and
+# aesimc/aesenc/aesenclast/aesdec/aesdeclast (two %xmm registers).
+# Returns the ".byte ..." string, undef for an unknown two-register aes*
+# mnemonic, or $line unchanged when neither pattern matches.
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x66);           # mandatory operand-size prefix
+
+    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+       rex(\@opcode,$4,$3);
+       push @opcode,0x0f,0x3a,0xdf;
+       push @opcode,0xc0|($3&7)|(($4&7)<<3);   # ModR/M
+       my $c=$2;
+       push @opcode,$c=~/^0/?oct($c):$c;       # oct() also decodes 0x.. hex
+       return ".byte\t".join(',',@opcode);
+    }
+    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+       my %opcodelet = (
+               "aesimc" => 0xdb,
+               "aesenc" => 0xdc,       "aesenclast" => 0xdd,
+               "aesdec" => 0xde,       "aesdeclast" => 0xdf
+       );
+       return undef if (!defined($opcodelet{$1}));
+       rex(\@opcode,$3,$2);
+       push @opcode,0x0f,0x38,$opcodelet{$1};
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);   # ModR/M
+       return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+# Evaluate every `...` expression embedded in the generated text.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+# Pass each aes* instruction that has %xmm operands through aesni(), which
+# substitutes a .byte encoding for the forms it recognises.
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+
+# A failed flush, or a failing consumer on the other end of STDOUT, must not
+# pass silently: it would leave a truncated or missing assembler file.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/devel/perlasm/cbc.pl b/devel/perlasm/cbc.pl
new file mode 100644
index 0000000..6fc2510
--- /dev/null
+++ b/devel/perlasm/cbc.pl
@@ -0,0 +1,349 @@
+#!/usr/local/bin/perl
+
+# void des_ncbc_encrypt(input, output, length, schedule, ivec, enc)
+# des_cblock (*input);
+# des_cblock (*output);
+# long length;
+# des_key_schedule schedule;
+# des_cblock (*ivec);
+# int enc;
+#
+# calls 
+# des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT);
+#
+
+#&cbc("des_ncbc_encrypt","des_encrypt",0);
+#&cbc("BF_cbc_encrypt","BF_encrypt","BF_encrypt",
+#      1,4,5,3,5,-1);
+#&cbc("des_ncbc_encrypt","des_encrypt","des_encrypt",
+#      0,4,5,3,5,-1);
+#&cbc("des_ede3_cbc_encrypt","des_encrypt3","des_decrypt3",
+#      0,6,7,3,4,5);
+#
+# When doing a cipher that needs bigendian order,
+# for encrypt, the iv is kept in bigendian form,
+# while for decrypt, it is kept in little endian.
+# cbc($name,$enc_func,$dec_func,$swap,$iv_off,$enc_off,$p1,$p2,$p3)
+# Emits, through the x86asm.pl helpers, a function called $name that runs
+# $enc_func (encrypt flag non-zero) or $dec_func (flag zero) over 8-byte
+# blocks, chaining each block with the iv / previous cipher text, and
+# writes the final iv back through the iv pointer before returning.
+sub cbc
+       {
+       local($name,$enc_func,$dec_func,$swap,$iv_off,$enc_off,$p1,$p2,$p3)=@_;
+       # name is the function name
+       # enc_func and dec_func and the functions to call for encrypt/decrypt
+       # swap is true if byte order needs to be reversed
+       # iv_off is parameter number for the iv 
+       # enc_off is parameter number for the encrypt/decrypt flag
+       # p1,p2,p3 are the offsets for parameters to be passed to the
+       # underlying calls.
+
+       &function_begin_B($name,"");
+       &comment("");
+
+       $in="esi";
+       $out="edi";
+       $count="ebp";
+
+       &push("ebp");
+       &push("ebx");
+       &push("esi");
+       &push("edi");
+
+       # offset from esp to the 8-byte work block once the arguments for
+       # the underlying call (data pointer plus up to three parameters)
+       # have been pushed
+       $data_off=4;
+       $data_off+=4 if ($p1 > 0);
+       $data_off+=4 if ($p2 > 0);
+       $data_off+=4 if ($p3 > 0);
+
+       &mov($count,    &wparam(2));    # length
+
+       &comment("getting iv ptr from parameter $iv_off");
+       &mov("ebx",     &wparam($iv_off));      # Get iv ptr
+
+       &mov($in,       &DWP(0,"ebx","",0));#   iv[0]
+       &mov($out,      &DWP(4,"ebx","",0));#   iv[1]
+
+       &push($out);
+       &push($in);
+       &push($out);    # used in decrypt for iv[1]
+       &push($in);     # used in decrypt for iv[0]
+
+       &mov("ebx",     "esp");         # This is the address of tin[2]
+
+       &mov($in,       &wparam(0));    # in
+       &mov($out,      &wparam(1));    # out
+
+       # We have loaded them all, now let's push things
+       &comment("getting encrypt flag from parameter $enc_off");
+       &mov("ecx",     &wparam($enc_off));     # Get enc flag
+       if ($p3 > 0)
+               {
+               &comment("get and push parameter $p3");
+               if ($enc_off != $p3)
+                       { &mov("eax",   &wparam($p3)); &push("eax"); }
+               else    { &push("ecx"); }
+               }
+       if ($p2 > 0)
+               {
+               &comment("get and push parameter $p2");
+               if ($enc_off != $p2)
+                       { &mov("eax",   &wparam($p2)); &push("eax"); }
+               else    { &push("ecx"); }
+               }
+       if ($p1 > 0)
+               {
+               &comment("get and push parameter $p1");
+               if ($enc_off != $p1)
+                       { &mov("eax",   &wparam($p1)); &push("eax"); }
+               else    { &push("ecx"); }
+               }
+       &push("ebx");           # push data/iv
+
+       &cmp("ecx",0);
+       &jz(&label("decrypt"));
+
+       &and($count,0xfffffff8);
+       &mov("eax",     &DWP($data_off,"esp","",0));    # load iv[0]
+       &mov("ebx",     &DWP($data_off+4,"esp","",0));  # load iv[1]
+
+       &jz(&label("encrypt_finish"));  # flags still come from the and above
+
+       #############################################################
+
+       &set_label("encrypt_loop");
+       # encrypt start 
+       # "eax" and "ebx" hold iv (or the last cipher text)
+
+       &mov("ecx",     &DWP(0,$in,"",0));      # load first 4 bytes
+       &mov("edx",     &DWP(4,$in,"",0));      # second 4 bytes
+
+       &xor("eax",     "ecx");
+       &xor("ebx",     "edx");
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov(&DWP($data_off,"esp","",0),        "eax"); # put in array for call
+       &mov(&DWP($data_off+4,"esp","",0),      "ebx"); #
+
+       &call($enc_func);
+
+       &mov("eax",     &DWP($data_off,"esp","",0));
+       &mov("ebx",     &DWP($data_off+4,"esp","",0));
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov(&DWP(0,$out,"",0),"eax");
+       &mov(&DWP(4,$out,"",0),"ebx");
+
+       # eax and ebx are the next iv.
+
+       &add($in,       8);
+       &add($out,      8);
+
+       &sub($count,    8);
+       &jnz(&label("encrypt_loop"));
+
+###################################################################3
+       &set_label("encrypt_finish");
+       &mov($count,    &wparam(2));    # length
+       &and($count,    7);
+       &jz(&label("finish"));
+       &call(&label("PIC_point"));
+&set_label("PIC_point");
+       &blindpop("edx");
+       &lea("ecx",&DWP(&label("cbc_enc_jmp_table")."-".&label("PIC_point"),"edx"));
+       # The statement below used to lack its terminating ';' and only
+       # parsed because the following '&add' was read as a binary AND.
+       &mov($count,&DWP(0,"ecx",$count,4));
+       &add($count,"edx");
+       &xor("ecx","ecx");
+       &xor("edx","edx");
+       #&mov($count,&DWP(&label("cbc_enc_jmp_table"),"",$count,4));
+       &jmp_ptr($count);
+
+&set_label("ej7");
+       &movb(&HB("edx"),       &BP(6,$in,"",0));
+       &shl("edx",8);
+&set_label("ej6");
+       &movb(&HB("edx"),       &BP(5,$in,"",0));
+&set_label("ej5");
+       &movb(&LB("edx"),       &BP(4,$in,"",0));
+&set_label("ej4");
+       &mov("ecx",             &DWP(0,$in,"",0));
+       &jmp(&label("ejend"));
+&set_label("ej3");
+       &movb(&HB("ecx"),       &BP(2,$in,"",0));
+       &shl("ecx",8);
+&set_label("ej2");
+       &movb(&HB("ecx"),       &BP(1,$in,"",0));
+&set_label("ej1");
+       &movb(&LB("ecx"),       &BP(0,$in,"",0));
+&set_label("ejend");
+
+       &xor("eax",     "ecx");
+       &xor("ebx",     "edx");
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov(&DWP($data_off,"esp","",0),        "eax"); # put in array for call
+       &mov(&DWP($data_off+4,"esp","",0),      "ebx"); #
+
+       &call($enc_func);
+
+       &mov("eax",     &DWP($data_off,"esp","",0));
+       &mov("ebx",     &DWP($data_off+4,"esp","",0));
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov(&DWP(0,$out,"",0),"eax");
+       &mov(&DWP(4,$out,"",0),"ebx");
+
+       &jmp(&label("finish"));
+
+       #############################################################
+       #############################################################
+       &set_label("decrypt",1);
+       # decrypt start 
+       &and($count,0xfffffff8);
+       # The next 2 instructions are only for if the jz is taken
+       &mov("eax",     &DWP($data_off+8,"esp","",0));  # get iv[0]
+       &mov("ebx",     &DWP($data_off+12,"esp","",0)); # get iv[1]
+       &jz(&label("decrypt_finish"));
+
+       &set_label("decrypt_loop");
+       &mov("eax",     &DWP(0,$in,"",0));      # load first 4 bytes
+       &mov("ebx",     &DWP(4,$in,"",0));      # second 4 bytes
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov(&DWP($data_off,"esp","",0),        "eax"); # put back
+       &mov(&DWP($data_off+4,"esp","",0),      "ebx"); #
+
+       &call($dec_func);
+
+       &mov("eax",     &DWP($data_off,"esp","",0));    # get return
+       &mov("ebx",     &DWP($data_off+4,"esp","",0));  #
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov("ecx",     &DWP($data_off+8,"esp","",0));  # get iv[0]
+       &mov("edx",     &DWP($data_off+12,"esp","",0)); # get iv[1]
+
+       &xor("ecx",     "eax");
+       &xor("edx",     "ebx");
+
+       &mov("eax",     &DWP(0,$in,"",0));      # get old cipher text,
+       &mov("ebx",     &DWP(4,$in,"",0));      # next iv actually
+
+       &mov(&DWP(0,$out,"",0),"ecx");
+       &mov(&DWP(4,$out,"",0),"edx");
+
+       &mov(&DWP($data_off+8,"esp","",0),      "eax"); # save iv
+       &mov(&DWP($data_off+12,"esp","",0),     "ebx"); #
+
+       &add($in,       8);
+       &add($out,      8);
+
+       &sub($count,    8);
+       &jnz(&label("decrypt_loop"));
+############################ ENDIT #######################3
+       &set_label("decrypt_finish");
+       &mov($count,    &wparam(2));    # length
+       &and($count,    7);
+       &jz(&label("finish"));
+
+       &mov("eax",     &DWP(0,$in,"",0));      # load first 4 bytes
+       &mov("ebx",     &DWP(4,$in,"",0));      # second 4 bytes
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov(&DWP($data_off,"esp","",0),        "eax"); # put back
+       &mov(&DWP($data_off+4,"esp","",0),      "ebx"); #
+
+       &call($dec_func);
+
+       &mov("eax",     &DWP($data_off,"esp","",0));    # get return
+       &mov("ebx",     &DWP($data_off+4,"esp","",0));  #
+
+       &bswap("eax")   if $swap;
+       &bswap("ebx")   if $swap;
+
+       &mov("ecx",     &DWP($data_off+8,"esp","",0));  # get iv[0]
+       &mov("edx",     &DWP($data_off+12,"esp","",0)); # get iv[1]
+
+       &xor("ecx",     "eax");
+       &xor("edx",     "ebx");
+
+       # this is for when we exit
+       &mov("eax",     &DWP(0,$in,"",0));      # get old cipher text,
+       &mov("ebx",     &DWP(4,$in,"",0));      # next iv actually
+
+       # Execution falls straight through dj7..dj4 here; no jump into the
+       # dj labels is emitted because cbc_dec_jmp_table is commented out.
+&set_label("dj7");
+       &rotr("edx",    16);
+       &movb(&BP(6,$out,"",0), &LB("edx"));
+       &shr("edx",16);
+&set_label("dj6");
+       &movb(&BP(5,$out,"",0), &HB("edx"));
+&set_label("dj5");
+       &movb(&BP(4,$out,"",0), &LB("edx"));
+&set_label("dj4");
+       &mov(&DWP(0,$out,"",0), "ecx");
+       &jmp(&label("djend"));
+       # dj3..dj1 are currently unreachable.  They mirror dj7..dj5: after
+       # the rotate the low half must be shifted back down (shr, not shl)
+       # and the bytes belong in the output buffer, not the input buffer.
+&set_label("dj3");
+       &rotr("ecx",    16);
+       &movb(&BP(2,$out,"",0), &LB("ecx"));
+       &shr("ecx",16);
+&set_label("dj2");
+       &movb(&BP(1,$out,"",0), &HB("ecx"));
+&set_label("dj1");
+       &movb(&BP(0,$out,"",0), &LB("ecx"));
+&set_label("djend");
+
+       # final iv is still in eax:ebx
+       &jmp(&label("finish"));
+
+
+############################ FINISH #######################3
+       &set_label("finish",1);
+       &mov("ecx",     &wparam($iv_off));      # Get iv ptr
+
+       #################################################
+       # drop the four iv dwords, the data pointer and any pushed parameters
+       $total=16+4;
+       $total+=4 if ($p1 > 0);
+       $total+=4 if ($p2 > 0);
+       $total+=4 if ($p3 > 0);
+       &add("esp",$total);
+
+       &mov(&DWP(0,"ecx","",0),        "eax"); # save iv
+       &mov(&DWP(4,"ecx","",0),        "ebx"); # save iv
+
+       &function_end_A($name);
+
+       &align(64);
+       &set_label("cbc_enc_jmp_table");
+       &data_word("0");
+       &data_word(&label("ej1")."-".&label("PIC_point"));
+       &data_word(&label("ej2")."-".&label("PIC_point"));
+       &data_word(&label("ej3")."-".&label("PIC_point"));
+       &data_word(&label("ej4")."-".&label("PIC_point"));
+       &data_word(&label("ej5")."-".&label("PIC_point"));
+       &data_word(&label("ej6")."-".&label("PIC_point"));
+       &data_word(&label("ej7")."-".&label("PIC_point"));
+       # not used
+       #&set_label("cbc_dec_jmp_table",1);
+       #&data_word("0");
+       #&data_word(&label("dj1")."-".&label("PIC_point"));
+       #&data_word(&label("dj2")."-".&label("PIC_point"));
+       #&data_word(&label("dj3")."-".&label("PIC_point"));
+       #&data_word(&label("dj4")."-".&label("PIC_point"));
+       #&data_word(&label("dj5")."-".&label("PIC_point"));
+       #&data_word(&label("dj6")."-".&label("PIC_point"));
+       #&data_word(&label("dj7")."-".&label("PIC_point"));
+       &align(64);
+
+       &function_end_B($name);
+       
+       }
+
+1;
diff --git a/devel/perlasm/cpuid-x86.pl b/devel/perlasm/cpuid-x86.pl
new file mode 100644
index 0000000..50def40
--- /dev/null
+++ b/devel/perlasm/cpuid-x86.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Nikos Mavrogiannopoulos
+# Placed under the LGPL
+# ====================================================================
+#
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../crypto/perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+# _gnutls_cpuid: executes CPUID with the first stack argument in eax and
+# stores the resulting eax, ebx, ecx and edx through the pointers passed as
+# the second to fifth stack arguments.  ebx, esi and edi are saved in a
+# 12-byte local area and restored before returning.
+&function_begin_B("_gnutls_cpuid");
+       &push   ("ebp");
+       &mov    ("ebp", "esp");
+       &sub    ("esp", 12);
+       &mov    (&DWP(0,"esp"), "ebx");
+       &mov    ("eax",&DWP(8,"ebp"));          # first argument -> eax
+       &mov    (&DWP(4,"esp"), "esi");
+       &mov    (&DWP(8,"esp"), "edi");
+       &push   ("ebx");
+       &cpuid  ();
+       &mov    ("edi", "ebx");                 # keep the ebx result in edi
+       &pop    ("ebx");
+       &mov    ("esi","edx");                  # keep the edx result in esi
+       &mov    ("edx",&DWP(12,"ebp"));
+       &mov    (&DWP(0,"edx"), "eax");         # *arg2 = eax result
+       &mov    ("eax",&DWP(16,"ebp"));
+       &mov    (&DWP(0,"eax"), "edi");         # *arg3 = ebx result
+       &mov    ("eax",&DWP(20,"ebp"));
+       &mov    (&DWP(0,"eax"), "ecx");         # *arg4 = ecx result
+       &mov    ("eax",&DWP(24,"ebp"));
+       &mov    (&DWP(0,"eax"), "esi");         # *arg5 = edx result
+       &mov    ("ebx",&DWP(0,"esp"));
+       &mov    ("esi",&DWP(4,"esp"));
+       &mov    ("edi",&DWP(8,"esp"));
+       &mov    ("esp","ebp");
+       &pop    ("ebp");
+       &ret    ();
+&function_end_B("_gnutls_cpuid");
+
+# _gnutls_have_cpuid: sets bit 0x200000 (bit 21) in EFLAGS, reads EFLAGS
+# back and returns that bit in eax, i.e. non-zero when the bit could be set
+# and zero otherwise.  The modified EFLAGS value is left in place on return.
+&function_begin_B("_gnutls_have_cpuid");
+       &pushf  ();
+       &pop    ("eax");
+       &or     ("eax",0x200000);
+       &push   ("eax");
+       &popf   ();
+       &pushf  ();
+       &pop    ("eax");
+       &and     ("eax",0x200000);
+       &ret    ();
+&function_end_B("_gnutls_have_cpuid");
+
+&asciz("CPUID for x86");
+&asm_finish();
diff --git a/devel/perlasm/cpuid-x86_64.pl b/devel/perlasm/cpuid-x86_64.pl
new file mode 100644
index 0000000..b821a49
--- /dev/null
+++ b/devel/perlasm/cpuid-x86_64.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Nikos Mavrogiannopoulos
+# Based on e_padlock-x86_64
+# ====================================================================
+#
+
+# Usage: cpuid-x86_64.pl [flavour] output
+# A single argument containing a dot is taken as the output file name.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate the translator next to this script or in ../../crypto/perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed below is piped through the translator; stop right away
+# if it cannot be started instead of silently producing no output.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+$code=".text\n";
+
+# NOTE(review): these names are not referenced by the code below, which
+# spells out the Unix-order registers directly.
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+
+$code.=<<___;
+# _gnutls_cpuid: runs CPUID with the 32-bit value from %edi in %eax and
+# stores the resulting EAX, EBX, ECX and EDX as 32-bit values through the
+# pointers received in %rsi, %rdx, %rcx and %r8.  The arguments are spilled
+# below %rbp first because CPUID overwrites %edx and %ecx.
+.globl _gnutls_cpuid
+.type _gnutls_cpuid,\@function
+.align 16
+_gnutls_cpuid:
+       pushq   %rbp
+       movq    %rsp, %rbp
+       pushq   %rbx
+       movl    %edi, -12(%rbp)
+       movq    %rsi, -24(%rbp)
+       movq    %rdx, -32(%rbp)
+       movq    %rcx, -40(%rbp)
+       movq    %r8, -48(%rbp)
+       movl    -12(%rbp), %eax
+       movl    %eax, -60(%rbp)
+       movl    -60(%rbp), %eax
+       cpuid
+       movl    %edx, -56(%rbp)
+       movl    %ecx, %esi
+       movl    %eax, -52(%rbp)
+       movq    -24(%rbp), %rax
+       movl    -52(%rbp), %edx
+       movl    %edx, (%rax)
+       movq    -32(%rbp), %rax
+       movl    %ebx, (%rax)
+       movq    -40(%rbp), %rax
+       movl    %esi, (%rax)
+       movq    -48(%rbp), %rax
+       movl    -56(%rbp), %ecx
+       movl    %ecx, (%rax)
+       popq    %rbx
+       leave
+       ret
+.size _gnutls_cpuid,.-_gnutls_cpuid
+___
+
+# Evaluate every `...` expression embedded in the generated text.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+# A non-zero exit of the translator surfaces here as a failed close.
+close STDOUT or die "error closing STDOUT: $!";
+
diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl
new file mode 100644
index 0000000..7a52528
--- /dev/null
+++ b/devel/perlasm/e_padlock-x86.pl
@@ -0,0 +1,548 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <address@hidden> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2011
+#
+# Assembler helpers for Padlock engine. Compared to original engine
+# version relying on inline assembler and compiled with gcc 3.4.6 it
+# was measured to provide ~100% improvement on misaligned data in ECB
+# mode and ~75% in CBC mode. For aligned data improvement can be
+# observed for short inputs only, e.g. 45% for 64-byte messages in
+# ECB mode, 20% in CBC. Difference in performance for aligned vs.
+# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
+# These are approximately same factors as for hardware support, so
+# there is little reason to rely on the latter. On the contrary, it
+# might actually hurt performance in mixture of aligned and misaligned
+# buffers, because a) if you choose to flip 'align' flag in control
+# word on per-buffer basis, then you'd have to reload key context,
+# which incurs penalty; b) if you choose to set 'align' flag
+# permanently, it limits performance even for aligned data to ~1/2.
+# All above mentioned results were collected on 1.5GHz C7. Nano on the
+# other hand handles unaligned data more gracefully. Depending on
+# algorithm and how unaligned data is, hardware can be up to 70% more
+# efficient than below software alignment procedures, nor does 'align'
+# flag have any effect on aligned performance [if it has any meaning at all].
+# Therefore suggestion is to unconditionally set 'align' flag on Nano
+# for optimal performance.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../crypto/perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+# Per-mode length threshold: generate_mode() compares the length against
+# this value and branches to its "<mode>_short" label when it is not larger.
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
+# Chunk size used by generate_mode(), which also masks with this value - 1.
+$PADLOCK_CHUNK=512;    # Must be a power of 2 larger than 16
+
+# Register names shared by the functions generated below.
+$ctx="edx";
+$out="edi";
+$inp="esi";
+$len="ecx";
+$chunk="ebx";
+
+# padlock_capability: returns 0 in eax unless all of the following hold:
+# EFLAGS bit 21 can be toggled, CPUID leaf 0 reports the vendor string
+# "CentaurHauls", and the highest leaf reported by CPUID 0xC0000000 is at
+# least 0xC0000001.  Otherwise it returns edx of CPUID leaf 0xC0000001 with
+# bit 4 replaced by a "Nano" indicator, which is set when
+# (eax | 0x0f) & 0x0fff of CPUID leaf 1 equals 0x06ff.
+&function_begin_B("padlock_capability");
+       &push   ("ebx");
+       &pushf  ();
+       &pop    ("eax");
+       &mov    ("ecx","eax");
+       &xor    ("eax",1<<21);
+       &push   ("eax");
+       &popf   ();
+       &pushf  ();
+       &pop    ("eax");
+       &xor    ("ecx","eax");
+       &xor    ("eax","eax");
+       &bt     ("ecx",21);
+       &jnc    (&label("noluck"));
+       &cpuid  ();
+       &xor    ("eax","eax");
+       &cmp    ("ebx","0x".unpack("H*",'tneC'));
+       &jne    (&label("noluck"));
+       &cmp    ("edx","0x".unpack("H*",'Hrua'));
+       &jne    (&label("noluck"));
+       &cmp    ("ecx","0x".unpack("H*",'slua'));
+       &jne    (&label("noluck"));
+       &mov    ("eax",0xC0000000);
+       &cpuid  ();
+       &mov    ("edx","eax");
+       &xor    ("eax","eax");
+       &cmp    ("edx",0xC0000001);
+       &jb     (&label("noluck"));
+       &mov    ("eax",1);
+       &cpuid  ();
+       &or     ("eax",0x0f);
+       &xor    ("ebx","ebx");
+       &and    ("eax",0x0fff);
+       &cmp    ("eax",0x06ff);         # check for Nano
+       &sete   ("bl");
+       &mov    ("eax",0xC0000001);
+       &push   ("ebx");                # cpuid overwrites ebx
+       &cpuid  ();
+       &pop    ("ebx");
+       &mov    ("eax","edx");
+       &shl    ("ebx",4);              # bit#4 denotes Nano
+       &and    ("eax",0xffffffef);
+       &or     ("eax","ebx");
+&set_label("noluck");
+       &pop    ("ebx");
+       &ret    ();
+&function_end_B("padlock_capability");
+
+# padlock_key_bswap: byte-swaps consecutive 32-bit words in place, starting
+# at the address passed as the first argument.  The number of words is the
+# 32-bit value read from offset 240 of that same address.
+# NOTE(review): the count is not checked for zero, and a zero count would
+# wrap around in the sub/jnz loop -- confirm that callers always store a
+# non-zero word count at offset 240.
+&function_begin_B("padlock_key_bswap");
+       &mov    ("edx",&wparam(0));
+       &mov    ("ecx",&DWP(240,"edx"));        # loop count
+&set_label("bswap_loop");
+       &mov    ("eax",&DWP(0,"edx"));
+       &bswap  ("eax");
+       &mov    (&DWP(0,"edx"),"eax");
+       &lea    ("edx",&DWP(4,"edx"));          # advance to the next word
+       &sub    ("ecx",1);
+       &jnz    (&label("bswap_loop"));
+       &ret    ();
+&function_end_B("padlock_key_bswap");
+
+# This is heuristic key context tracing. At first one
+# believes that one should use atomic swap instructions,
+# but it's not actually necessary. Point is that if
+# padlock_saved_context was changed by another thread
+# after we've read it and before we compare it with ctx,
+# our key *shall* be reloaded upon thread context switch
+# and we are therefore set in either case...
+&static_label("padlock_saved_context");
+
+# padlock_verify_context: loads the first argument into $ctx and the address
+# of padlock_saved_context into eax (absolute for win32/coff, otherwise as
+# an offset from verify_pic_point that the callee completes), pushes EFLAGS
+# and calls _padlock_verify_ctx.  The pushed flags word is then discarded
+# with lea, which does not modify EFLAGS.
+&function_begin_B("padlock_verify_context");
+       &mov    ($ctx,&wparam(0));
+       &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
+                      &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
+       &pushf  ();
+       &call   ("_padlock_verify_ctx");
+&set_label("verify_pic_point");
+       &lea    ("esp",&DWP(4,"esp"));
+       &ret    ();
+&function_end_B("padlock_verify_context");
+
+# _padlock_verify_ctx: internal helper.
+# In:  $ctx                = context pointer
+#      eax                 = address of padlock_saved_context, or (non
+#                            win32/coff) its offset from the caller's
+#                            *_pic_point label, which is the return address
+#      4(esp)              = EFLAGS pushed by the caller
+# If bit 30 of the saved EFLAGS is set and $ctx differs from the saved
+# context, a pushf/popf pair is executed (the same pair padlock_reload_key
+# issues).  In every case $ctx is stored as the new saved context.
+&function_begin_B("_padlock_verify_ctx");
+       # turn the pic-relative offset into &padlock_saved_context
+       &add    ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));
+       &bt     (&DWP(4,"esp"),30);             # eflags
+       &jnc    (&label("verified"));
+       &cmp    ($ctx,&DWP(0,"eax"));
+       &je     (&label("verified"));
+       &pushf  ();
+       &popf   ();
+&set_label("verified");
+       &mov    (&DWP(0,"eax"),$ctx);
+       &ret    ();
+&function_end_B("_padlock_verify_ctx");
+
+# padlock_reload_key: executes a pushf/popf pair and returns.
+# NOTE(review): going by the function name, rewriting EFLAGS this way is
+# what makes the unit reload its key -- confirm against the PadLock
+# programming guide.
+&function_begin_B("padlock_reload_key");
+       &pushf  ();
+       &popf   ();
+       &ret    ();
+&function_end_B("padlock_reload_key");
+
+# padlock_aes_block(out, inp, ctx): processes exactly one block with
+# "rep xcryptecb".  The control word is taken from ctx+16 and the key from
+# ctx+32.  edi, esi and ebx are saved and restored around the operation.
+&function_begin_B("padlock_aes_block");
+       &push   ("edi");
+       &push   ("esi");
+       &push   ("ebx");
+       &mov    ($out,&wparam(0));              # must be 16-byte aligned
+       &mov    ($inp,&wparam(1));              # must be 16-byte aligned
+       &mov    ($ctx,&wparam(2));
+       &mov    ($len,1);                       # block count
+       &lea    ("ebx",&DWP(32,$ctx));          # key
+       &lea    ($ctx,&DWP(16,$ctx));           # control word
+       &data_byte(0xf3,0x0f,0xa7,0xc8);        # rep xcryptecb
+       &pop    ("ebx");
+       &pop    ("esi");
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_aes_block");
+
+sub generate_mode {
+my ($mode,$opcode) = @_;
+# int padlock_$mode_encrypt(void *out, const void *inp,
+#              struct padlock_cipher_data *ctx, size_t len);
+&function_begin("padlock_${mode}_encrypt");
+       &mov    ($out,&wparam(0));
+       &mov    ($inp,&wparam(1));
+       &mov    ($ctx,&wparam(2));
+       &mov    ($len,&wparam(3));
+       &test   ($ctx,15);
+       &jnz    (&label("${mode}_abort"));
+       &test   ($len,15);
+       &jnz    (&label("${mode}_abort"));
+       &lea    ("eax",($::win32 or $::coff) ? 
&DWP(&label("padlock_saved_context")) :
+                      
&DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
+       &pushf  ();
+       &cld    ();
+       &call   ("_padlock_verify_ctx");
+&set_label("${mode}_pic_point");
+       &lea    ($ctx,&DWP(16,$ctx));   # control word
+       &xor    ("eax","eax");
+                                       if ($mode eq "ctr32") {
+       &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
+                                       } else {
+       &xor    ("ebx","ebx");
+    if ($PADLOCK_MARGIN{$mode}) {
+       &cmp    ($len,$PADLOCK_MARGIN{$mode});
+       &jbe    (&label("${mode}_short"));
+    }
+       &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
+       &jnz    (&label("${mode}_aligned"));
+       &test   ($out,0x0f);
+       &setz   ("al");                 # !out_misaligned
+       &test   ($inp,0x0f);
+       &setz   ("bl");                 # !inp_misaligned
+       &test   ("eax","ebx");
+       &jnz    (&label("${mode}_aligned"));
+       &neg    ("eax");
+                                       }
+       &mov    ($chunk,$PADLOCK_CHUNK);
+       &not    ("eax");                # out_misaligned?-1:0
+       &lea    ("ebp",&DWP(-24,"esp"));
+       &cmp    ($len,$chunk);
+       &cmovc  ($chunk,$len);          # 
chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
+       &and    ("eax",$chunk);         # out_misaligned?chunk:0
+       &mov    ($chunk,$len);
+       &neg    ("eax");
+       &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
+       &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
+       &and    ("esp",-16);
+       &jmp    (&label("${mode}_loop"));
+
+&set_label("${mode}_loop",16);
+       &mov    (&DWP(0,"ebp"),$out);           # save parameters
+       &mov    (&DWP(4,"ebp"),$inp);
+       &mov    (&DWP(8,"ebp"),$len);
+       &mov    ($len,$chunk);
+       &mov    (&DWP(12,"ebp"),$chunk);        # chunk
+                                               if ($mode eq "ctr32") {
+       &mov    ("ecx",&DWP(-4,$ctx));
+       &xor    ($out,$out);
+       &mov    ("eax",&DWP(-8,$ctx));          # borrow $len
+&set_label("${mode}_prepare");
+       &mov    (&DWP(12,"esp",$out),"ecx");
+       &bswap  ("ecx");
+       &movq   (&QWP(0,"esp",$out),"mm0");
+       &inc    ("ecx");
+       &mov    (&DWP(8,"esp",$out),"eax");
+       &bswap  ("ecx");
+       &lea    ($out,&DWP(16,$out));
+       &cmp    ($out,$chunk);
+       &jb     (&label("${mode}_prepare"));
+
+       &mov    (&DWP(-4,$ctx),"ecx");
+       &lea    ($inp,&DWP(0,"esp"));
+       &lea    ($out,&DWP(0,"esp"));
+       &mov    ($len,$chunk);
+                                               } else {
+       &test   ($out,0x0f);                    # out_misaligned
+       &cmovnz ($out,"esp");
+       &test   ($inp,0x0f);                    # inp_misaligned
+       &jz     (&label("${mode}_inp_aligned"));
+       &shr    ($len,2);
+       &data_byte(0xf3,0xa5);                  # rep movsl
+       &sub    ($out,$chunk);
+       &mov    ($len,$chunk);
+       &mov    ($inp,$out);
+&set_label("${mode}_inp_aligned");
+                                               }
+       &lea    ("eax",&DWP(-16,$ctx));         # ivp
+       &lea    ("ebx",&DWP(16,$ctx));          # key
+       &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
+       &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
+                                               if ($mode !~ /ecb|ctr/) {
+       &movaps ("xmm0",&QWP(0,"eax"));
+       &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
+                                               }
+       &mov    ($out,&DWP(0,"ebp"));           # restore parameters
+       &mov    ($chunk,&DWP(12,"ebp"));
+                                               if ($mode eq "ctr32") {
+       &mov    ($inp,&DWP(4,"ebp"));
+       &xor    ($len,$len);
+&set_label("${mode}_xor");
+       &movups ("xmm1",&QWP(0,$inp,$len));
+       &lea    ($len,&DWP(16,$len));
+       &pxor   ("xmm1",&QWP(-16,"esp",$len));
+       &movups (&QWP(-16,$out,$len),"xmm1");
+       &cmp    ($len,$chunk);
+       &jb     (&label("${mode}_xor"));
+                                               } else {
+       &test   ($out,0x0f);
+       &jz     (&label("${mode}_out_aligned"));
+       &mov    ($len,$chunk);
+       &shr    ($len,2);
+       &lea    ($inp,&DWP(0,"esp"));
+       &data_byte(0xf3,0xa5);                  # rep movsl
+       &sub    ($out,$chunk);
+&set_label("${mode}_out_aligned");
+       &mov    ($inp,&DWP(4,"ebp"));
+                                               }
+       &mov    ($len,&DWP(8,"ebp"));
+       &add    ($out,$chunk);
+       &add    ($inp,$chunk);
+       &sub    ($len,$chunk);
+       &mov    ($chunk,$PADLOCK_CHUNK);
+       &jnz    (&label("${mode}_loop"));
+                                               if ($mode ne "ctr32") {
+       &cmp    ("esp","ebp");
+       &je     (&label("${mode}_done"));
+                                               }
+       &pxor   ("xmm0","xmm0");
+       &lea    ("eax",&DWP(0,"esp"));
+&set_label("${mode}_bzero");
+       &movaps (&QWP(0,"eax"),"xmm0");
+       &lea    ("eax",&DWP(16,"eax"));
+       &cmp    ("ebp","eax");
+       &ja     (&label("${mode}_bzero"));
+
+&set_label("${mode}_done");
+       &lea    ("esp",&DWP(24,"ebp"));
+                                               if ($mode ne "ctr32") {
+       &jmp    (&label("${mode}_exit"));
+
+&set_label("${mode}_short",16);
+       &xor    ("eax","eax");
+       &lea    ("ebp",&DWP(-24,"esp"));
+       &sub    ("eax",$len);
+       &lea    ("esp",&DWP(0,"eax","ebp"));
+       &and    ("esp",-16);
+       &xor    ($chunk,$chunk);
+&set_label("${mode}_short_copy");
+       &movups ("xmm0",&QWP(0,$inp,$chunk));
+       &lea    ($chunk,&DWP(16,$chunk));
+       &cmp    ($len,$chunk);
+       &movaps (&QWP(-16,"esp",$chunk),"xmm0");
+       &ja     (&label("${mode}_short_copy"));
+       &mov    ($inp,"esp");
+       &mov    ($chunk,$len);
+       &jmp    (&label("${mode}_loop"));
+
+&set_label("${mode}_aligned",16);
+       &lea    ("eax",&DWP(-16,$ctx));         # ivp
+       &lea    ("ebx",&DWP(16,$ctx));          # key
+       &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
+       &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
+                                               if ($mode ne "ecb") {
+       &movaps ("xmm0",&QWP(0,"eax"));
+       &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
+                                               }
+&set_label("${mode}_exit");                    }
+       &mov    ("eax",1);
+       &lea    ("esp",&DWP(4,"esp"));          # popf
+       &emms   ()                              if ($mode eq "ctr32");
+&set_label("${mode}_abort");
+&function_end("padlock_${mode}_encrypt");
+}
+
+&generate_mode("ecb",0xc8);
+&generate_mode("cbc",0xd0);
+#&generate_mode("cfb",0xe0);
+#&generate_mode("ofb",0xe8);
+#&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
+                               # because hardware CTR was introduced later
+                               # and even has errata on certain C7 stepping.
+                               # own implementation *always* works, though
+                               # ~15% slower than dedicated hardware...
+
+# padlock_xstore: thin wrapper around the xstore instruction.
+# Loads the first stack argument into edi and the second into edx, then
+# emits xstore as raw opcode bytes. edi is saved and restored around the
+# instruction; eax is returned exactly as the instruction leaves it.
+# NOTE(review): presumably arg0 is the output buffer and arg1 the value
+# xstore expects in edx -- confirm against the C prototype.
+&function_begin_B("padlock_xstore");
+       &push   ("edi");
+       &mov    ("edi",&wparam(0));
+       &mov    ("edx",&wparam(1));
+       &data_byte(0x0f,0xa7,0xc0);             # xstore
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_xstore");
+
+# _win32_segv_handler: exception handler that the *_oneshot SHA routines
+# register on win32/coff builds.
+# Returns 1 (ExceptionContinueSearch) unless the exception code is
+# 0xC0000005 (STATUS_ACCESS_VIOLATION). For an access violation it adds 4
+# to the dword at offset 184 of the context record -- 4 being the length
+# of the raw-encoded "rep xsha*" instruction -- and returns 0
+# (ExceptionContinueExecution).
+# NOTE(review): offset 184 is presumably the saved instruction pointer in
+# the 32-bit CONTEXT structure -- confirm against the Windows headers.
+&function_begin_B("_win32_segv_handler");
+       &mov    ("eax",1);                      # ExceptionContinueSearch
+       &mov    ("edx",&wparam(0));             # *ExceptionRecord
+       &mov    ("ecx",&wparam(2));             # *ContextRecord
+       # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION ?
+       # (this statement used to lack its terminating semicolon, which made
+       # Perl parse it together with the following &jne as one expression)
+       &cmp    (&DWP(0,"edx"),0xC0000005);
+       &jne    (&label("ret"));
+       &add    (&DWP(184,"ecx"),4);            # skip over rep sha*
+       &mov    ("eax",0);                      # ExceptionContinueExecution
+&set_label("ret");
+       &ret    ();
+&function_end_B("_win32_segv_handler");
+&safeseh("_win32_segv_handler")                        if ($::win32);
+
+# padlock_sha1_oneshot: run "rep xsha1" with eax=0.
+#   wparam(0) - 20-byte context (16 bytes via xmm0 + 4 via eax), copied
+#               to a stack scratch area before and back after
+#   wparam(1) - loaded into esi for the instruction
+#   wparam(2) - loaded into ecx for the instruction
+# The scratch area is made by lowering esp by 128 and rounding it down to
+# a multiple of 16; it is accessed with movaps and edi points at it while
+# the instruction runs. edx keeps the pre-adjustment esp.
+# On win32/coff a registration record for _win32_segv_handler is pushed
+# and linked through fs:[0] first, and unlinked again before returning.
+# NOTE(review): presumably esi is the input pointer and ecx its length --
+# confirm against the PadLock documentation.
+&function_begin_B("padlock_sha1_oneshot");
+       &push   ("edi");
+       &push   ("esi");
+       &xor    ("eax","eax");
+       &mov    ("edi",&wparam(0));
+       &mov    ("esi",&wparam(1));
+       &mov    ("ecx",&wparam(2));
+    if ($::win32 or $::coff) {
+       # eax is 0 here, so %fs:(%eax) is the head of the handler chain
+       &push   (&::islabel("_win32_segv_handler"));
+       &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
+       &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
+    }
+       &mov    ("edx","esp");                  # put aside %esp
+       &add    ("esp",-128);                   # 32 is enough but spec says 128
+       &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
+       &and    ("esp",-16);
+       &mov    ("eax",&DWP(16,"edi"));
+       &movaps (&QWP(0,"esp"),"xmm0");
+       &mov    ("edi","esp");
+       &mov    (&DWP(16,"esp"),"eax");
+       &xor    ("eax","eax");
+       &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
+       &movaps ("xmm0",&QWP(0,"esp"));
+       &mov    ("eax",&DWP(16,"esp"));
+       &mov    ("esp","edx");                  # restore %esp
+    if ($::win32 or $::coff) {
+       &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
+       &lea    ("esp",&DWP(4,"esp"));          # drop the pushed handler address
+    }
+       &mov    ("edi",&wparam(0));
+       &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
+       &mov    (&DWP(16,"edi"),"eax");
+       &pop    ("esi");
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_sha1_oneshot");
+
+# padlock_sha1_blocks: run "rep xsha1" with eax=-1.
+#   wparam(0) - 20-byte context, staged through an aligned stack scratch
+#               area and written back afterwards
+#   wparam(1) - loaded into esi for the instruction
+#   wparam(2) - loaded into ecx for the instruction
+# The scratch area is made by lowering esp by 128 and rounding it down to
+# a multiple of 16; edx keeps the original esp. No exception handler is
+# installed in this variant.
+# NOTE(review): eax=-1 versus eax=0 is the only difference in how the
+# instruction is invoked compared with padlock_sha1_oneshot; presumably
+# it selects block-only processing without final padding -- confirm.
+&function_begin_B("padlock_sha1_blocks");
+       &push   ("edi");
+       &push   ("esi");
+       &mov    ("edi",&wparam(0));
+       &mov    ("esi",&wparam(1));
+       &mov    ("edx","esp");                  # put aside %esp
+       &mov    ("ecx",&wparam(2));
+       &add    ("esp",-128);
+       &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
+       &and    ("esp",-16);
+       &mov    ("eax",&DWP(16,"edi"));
+       &movaps (&QWP(0,"esp"),"xmm0");
+       &mov    ("edi","esp");
+       &mov    (&DWP(16,"esp"),"eax");
+       &mov    ("eax",-1);
+       &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
+       &movaps ("xmm0",&QWP(0,"esp"));
+       &mov    ("eax",&DWP(16,"esp"));
+       &mov    ("esp","edx");                  # restore %esp
+       &mov    ("edi",&wparam(0));
+       &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
+       &mov    (&DWP(16,"edi"),"eax");
+       &pop    ("esi");
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_sha1_blocks");
+
+# padlock_sha256_oneshot: run "rep xsha256" with eax=0.
+#   wparam(0) - 32-byte context (xmm0 + xmm1), copied to a stack scratch
+#               area before and back after
+#   wparam(1) - loaded into esi for the instruction
+#   wparam(2) - loaded into ecx for the instruction
+# The scratch area is made by lowering esp by 128 and rounding it down to
+# a multiple of 16; it is accessed with movaps and edi points at it while
+# the instruction runs. edx keeps the pre-adjustment esp.
+# On win32/coff a registration record for _win32_segv_handler is pushed
+# and linked through fs:[0] first, and unlinked again before returning.
+&function_begin_B("padlock_sha256_oneshot");
+       &push   ("edi");
+       &push   ("esi");
+       &xor    ("eax","eax");
+       &mov    ("edi",&wparam(0));
+       &mov    ("esi",&wparam(1));
+       &mov    ("ecx",&wparam(2));
+    if ($::win32 or $::coff) {
+       # eax is 0 here, so %fs:(%eax) is the head of the handler chain
+       &push   (&::islabel("_win32_segv_handler"));
+       &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
+       &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
+    }
+       &mov    ("edx","esp");                  # put aside %esp
+       &add    ("esp",-128);
+       &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
+       &and    ("esp",-16);
+       &movups ("xmm1",&QWP(16,"edi"));
+       &movaps (&QWP(0,"esp"),"xmm0");
+       &mov    ("edi","esp");
+       &movaps (&QWP(16,"esp"),"xmm1");
+       &xor    ("eax","eax");
+       &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
+       &movaps ("xmm0",&QWP(0,"esp"));
+       &movaps ("xmm1",&QWP(16,"esp"));
+       &mov    ("esp","edx");                  # restore %esp
+    if ($::win32 or $::coff) {
+       &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
+       &lea    ("esp",&DWP(4,"esp"));          # drop the pushed handler address
+    }
+       &mov    ("edi",&wparam(0));
+       &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
+       &movups (&QWP(16,"edi"),"xmm1");
+       &pop    ("esi");
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_sha256_oneshot");
+
+# padlock_sha256_blocks: run "rep xsha256" with eax=-1.
+#   wparam(0) - 32-byte context (xmm0 + xmm1), staged through an aligned
+#               stack scratch area and written back afterwards
+#   wparam(1) - loaded into esi for the instruction
+#   wparam(2) - loaded into ecx for the instruction
+# The scratch area is made by lowering esp by 128 and rounding it down to
+# a multiple of 16; edx keeps the original esp. No exception handler is
+# installed in this variant.
+&function_begin_B("padlock_sha256_blocks");
+       &push   ("edi");
+       &push   ("esi");
+       &mov    ("edi",&wparam(0));
+       &mov    ("esi",&wparam(1));
+       &mov    ("ecx",&wparam(2));
+       &mov    ("edx","esp");                  # put aside %esp
+       &add    ("esp",-128);
+       &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
+       &and    ("esp",-16);
+       &movups ("xmm1",&QWP(16,"edi"));
+       &movaps (&QWP(0,"esp"),"xmm0");
+       &mov    ("edi","esp");
+       &movaps (&QWP(16,"esp"),"xmm1");
+       &mov    ("eax",-1);
+       &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
+       &movaps ("xmm0",&QWP(0,"esp"));
+       &movaps ("xmm1",&QWP(16,"esp"));
+       &mov    ("esp","edx");                  # restore %esp
+       &mov    ("edi",&wparam(0));
+       &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
+       &movups (&QWP(16,"edi"),"xmm1");
+       &pop    ("esi");
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_sha256_blocks");
+
+# padlock_sha512_blocks: run "rep xsha512".
+#   wparam(0) - 64-byte context (xmm0..xmm3), staged through an aligned
+#               stack scratch area and written back afterwards
+#   wparam(1) - loaded into esi for the instruction
+#   wparam(2) - loaded into ecx for the instruction
+# The scratch area is made by lowering esp by 128 and rounding it down to
+# a multiple of 16; edx keeps the original esp.
+# NOTE(review): unlike the sha1/sha256 *_blocks routines, eax is not
+# initialised before the instruction here -- confirm that xsha512 does
+# not take a mode value in eax.
+&function_begin_B("padlock_sha512_blocks");
+       &push   ("edi");
+       &push   ("esi");
+       &mov    ("edi",&wparam(0));
+       &mov    ("esi",&wparam(1));
+       &mov    ("ecx",&wparam(2));
+       &mov    ("edx","esp");                  # put aside %esp
+       &add    ("esp",-128);
+       &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
+       &and    ("esp",-16);
+       &movups ("xmm1",&QWP(16,"edi"));
+       &movups ("xmm2",&QWP(32,"edi"));
+       &movups ("xmm3",&QWP(48,"edi"));
+       &movaps (&QWP(0,"esp"),"xmm0");
+       &mov    ("edi","esp");
+       &movaps (&QWP(16,"esp"),"xmm1");
+       &movaps (&QWP(32,"esp"),"xmm2");
+       &movaps (&QWP(48,"esp"),"xmm3");
+       &data_byte(0xf3,0x0f,0xa6,0xe0);        # rep xsha512
+       &movaps ("xmm0",&QWP(0,"esp"));
+       &movaps ("xmm1",&QWP(16,"esp"));
+       &movaps ("xmm2",&QWP(32,"esp"));
+       &movaps ("xmm3",&QWP(48,"esp"));
+       &mov    ("esp","edx");                  # restore %esp
+       &mov    ("edi",&wparam(0));
+       &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
+       &movups (&QWP(16,"edi"),"xmm1");
+       &movups (&QWP(32,"edi"),"xmm2");
+       &movups (&QWP(48,"edi"),"xmm3");
+       &pop    ("esi");
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_sha512_blocks");
+
+&asciz ("VIA Padlock x86 module, CRYPTOGAMS by <address@hidden>");
+&align (16);
+
+&dataseg();
+# Essentially this variable belongs in thread local storage.
+# Having this variable global on the other hand can only cause
+# few bogus key reloads [if any at all on single-CPU system],
+# so we accept the penalty...
+&set_label("padlock_saved_context",4);
+&data_word(0);
+
+&asm_finish();
diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl
new file mode 100644
index 0000000..cbffb9d
--- /dev/null
+++ b/devel/perlasm/e_padlock-x86_64.pl
@@ -0,0 +1,498 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <address@hidden> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2011
+#
+# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
+# details.
+
+# Command line: [flavour] output. A first argument that contains a dot is
+# taken to be the output file name, leaving $flavour undefined.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# The Win64 ABI is selected for nasm/masm/mingw64 flavours or a .asm output.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Look for the translator next to this script first, then at the
+# relative path ../../crypto/perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator. Fail
+# loudly if the pipe cannot be set up rather than silently emitting nothing.
+open STDOUT,"| $^X $xlate $flavour $output"
+       or die "can't call $xlate: $!";
+
+# $code accumulates the assembler template that is printed at the end.
+$code=".text\n";
+
+# Per-mode input length in bytes at or below which generate_mode takes
+# the copy-to-stack "short" path; a mode without an entry gets no such
+# check.
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64);        # prefetch errata
+# Upper bound on the bytes handled per pass of the bounce-buffer loop.
+$PADLOCK_CHUNK=512;    # Must be a power of 2 between 32 and 2^20
+
+# Register roles inside padlock_${mode}_encrypt. out/inp/ctx/len are the
+# registers that carry arguments 1-4 in the Unix order listed below.
+$ctx="%rdx";
+$out="%rdi";
+$inp="%rsi";
+$len="%rcx";
+$chunk="%rbx";
+
+# Argument registers for the routines that handle both ABIs themselves.
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code.=<<___;
+# padlock_capability: probe for PadLock through CPUID.
+# Returns 0 in %eax unless CPUID leaf 0 reports the vendor string
+# "CentaurHauls" (compared as %ebx/%edx/%ecx) and CPUID 0xC0000000
+# reports a maximum leaf of at least 0xC0000001. When both checks pass it
+# returns %edx of leaf 0xC0000001 with bit 4 forced to 1.
+# cpuid overwrites %rbx, so %rbx is parked in %r8 for the duration.
+.globl padlock_capability
+.type  padlock_capability,\@abi-omnipotent
+.align 16
+padlock_capability:
+       mov     %rbx,%r8
+       xor     %eax,%eax
+       cpuid
+       xor     %eax,%eax
+       cmp     \$`"0x".unpack("H*",'tneC')`,%ebx
+       jne     .Lnoluck
+       cmp     \$`"0x".unpack("H*",'Hrua')`,%edx
+       jne     .Lnoluck
+       cmp     \$`"0x".unpack("H*",'slua')`,%ecx
+       jne     .Lnoluck
+       mov     \$0xC0000000,%eax
+       cpuid
+       mov     %eax,%edx
+       xor     %eax,%eax
+       cmp     \$0xC0000001,%edx
+       jb      .Lnoluck
+       mov     \$0xC0000001,%eax
+       cpuid
+       mov     %edx,%eax
+       and     \$0xffffffef,%eax
+       or      \$0x10,%eax             # set Nano bit#4
+.Lnoluck:
+       mov     %r8,%rbx
+       ret
+.size  padlock_capability,.-padlock_capability
+
+# padlock_key_bswap: byte-swap a run of 32-bit words in place.
+# The word count is the 32-bit value at offset 240 from the first
+# argument; the words start at the first argument itself. Touches only
+# %eax, %edx and the argument register, and picks that register per ABI
+# on its own, hence the abi-omnipotent type.
+# NOTE(review): a count of 0 is not special-cased (the loop decrements
+# before it tests), so callers are presumably expected to pass nonzero.
+.globl padlock_key_bswap
+.type  padlock_key_bswap,\@abi-omnipotent,0
+.align 16
+padlock_key_bswap:
+       mov     240($arg1),%edx
+.Lbswap_loop:
+       mov     ($arg1),%eax
+       bswap   %eax
+       mov     %eax,($arg1)
+       lea     4($arg1),$arg1
+       sub     \$1,%edx
+       jnz     .Lbswap_loop
+       ret
+.size  padlock_key_bswap,.-padlock_key_bswap
+
+# padlock_verify_context: public wrapper around _padlock_verify_ctx.
+# Moves the first argument into the context register, pushes the flags so
+# the helper can read them at 8(%rsp), passes the address of
+# .Lpadlock_saved_context in %rax, and afterwards drops the pushed flags
+# with lea instead of popping them back.
+.globl padlock_verify_context
+.type  padlock_verify_context,\@abi-omnipotent
+.align 16
+padlock_verify_context:
+       mov     $arg1,$ctx
+       pushf
+       lea     .Lpadlock_saved_context(%rip),%rax
+       call    _padlock_verify_ctx
+       lea     8(%rsp),%rsp
+       ret
+.size  padlock_verify_context,.-padlock_verify_context
+
+# _padlock_verify_ctx: internal helper, not exported.
+# In:  context pointer in the context register, %rax = address of the
+#      saved-context slot, 8(%rsp) = flags image the caller pushed just
+#      before the call.
+# If bit 30 of that flags image is set and the slot holds a different
+# pointer, it executes pushf/popf, the same sequence padlock_reload_key
+# consists of. In every case it then stores the context pointer into the
+# slot. Clobbers %r8.
+.type  _padlock_verify_ctx,\@abi-omnipotent
+.align 16
+_padlock_verify_ctx:
+       mov     8(%rsp),%r8
+       bt      \$30,%r8
+       jnc     .Lverified
+       cmp     (%rax),$ctx
+       je      .Lverified
+       pushf
+       popf
+.Lverified:
+       mov     $ctx,(%rax)
+       ret
+.size  _padlock_verify_ctx,.-_padlock_verify_ctx
+
+# padlock_reload_key: executes pushf/popf and returns; takes no arguments.
+# NOTE(review): going by the name, rewriting the flags register is what
+# makes the unit reload its key -- confirm against the PadLock
+# programming guide.
+.globl padlock_reload_key
+.type  padlock_reload_key,\@abi-omnipotent
+.align 16
+padlock_reload_key:
+       pushf
+       popf
+       ret
+.size  padlock_reload_key,.-padlock_reload_key
+
+# padlock_aes_block: one "rep xcryptecb" with a count of 1 in %rcx.
+# Declared with three arguments. The third is the context: the control
+# word is taken from offset 16 and the key from offset 32. %rbx is needed
+# for the key pointer, so the caller's %rbx is parked in %r8.
+# NOTE(review): the first two arguments are left untouched in %rdi and
+# %rsi, presumably as the output and input pointers of the instruction.
+.globl padlock_aes_block
+.type  padlock_aes_block,\@function,3
+.align 16
+padlock_aes_block:
+       mov     %rbx,%r8
+       mov     \$1,$len
+       lea     32($ctx),%rbx           # key
+       lea     16($ctx),$ctx           # control word
+       .byte   0xf3,0x0f,0xa7,0xc8     # rep xcryptecb
+       mov     %r8,%rbx
+       ret
+.size  padlock_aes_block,.-padlock_aes_block
+
+# padlock_xstore: copies the low 32 bits of the second argument into %edx
+# and issues the raw-encoded xstore; the first argument stays in %rdi.
+# %eax is returned exactly as the instruction leaves it.
+.globl padlock_xstore
+.type  padlock_xstore,\@function,2
+.align 16
+padlock_xstore:
+       mov     %esi,%edx
+       .byte   0x0f,0xa7,0xc0          # xstore
+       ret
+.size  padlock_xstore,.-padlock_xstore
+
+# padlock_sha1_oneshot: "rep xsha1" with %rax=0.
+# The 20-byte context at the first argument (16 bytes via %xmm0 plus 4
+# via %eax) is copied to the bottom of a 128+8 byte stack frame, %rdi is
+# pointed at the copy, the third argument is moved to %rcx and the second
+# stays in %rsi. After the instruction the 20 bytes are copied back
+# through %rdx, which holds the original context pointer meanwhile.
+# The frame is accessed with movaps; 128+8 keeps %rsp 16-byte aligned
+# assuming the usual 8-byte offset on function entry.
+.globl padlock_sha1_oneshot
+.type  padlock_sha1_oneshot,\@function,3
+.align 16
+padlock_sha1_oneshot:
+       mov     %rdx,%rcx
+       mov     %rdi,%rdx               # put aside %rdi
+       movups  (%rdi),%xmm0            # copy-in context
+       sub     \$128+8,%rsp
+       mov     16(%rdi),%eax
+       movaps  %xmm0,(%rsp)
+       mov     %rsp,%rdi
+       mov     %eax,16(%rsp)
+       xor     %rax,%rax
+       .byte   0xf3,0x0f,0xa6,0xc8     # rep xsha1
+       movaps  (%rsp),%xmm0
+       mov     16(%rsp),%eax
+       add     \$128+8,%rsp
+       movups  %xmm0,(%rdx)            # copy-out context
+       mov     %eax,16(%rdx)
+       ret
+.size  padlock_sha1_oneshot,.-padlock_sha1_oneshot
+
+# padlock_sha1_blocks: "rep xsha1" with %rax=-1.
+# The 20-byte context at the first argument is staged at the bottom of a
+# 128+8 byte stack frame (accessed with movaps), %rdi is pointed at the
+# copy, the third argument is moved to %rcx and the second stays in %rsi.
+# The context is copied back through %rdx afterwards.
+# NOTE(review): %rax=-1 versus 0 is the only difference from
+# padlock_sha1_oneshot; presumably it selects block-only processing
+# without final padding -- confirm.
+.globl padlock_sha1_blocks
+.type  padlock_sha1_blocks,\@function,3
+.align 16
+padlock_sha1_blocks:
+       mov     %rdx,%rcx
+       mov     %rdi,%rdx               # put aside %rdi
+       movups  (%rdi),%xmm0            # copy-in context
+       sub     \$128+8,%rsp
+       mov     16(%rdi),%eax
+       movaps  %xmm0,(%rsp)
+       mov     %rsp,%rdi
+       mov     %eax,16(%rsp)
+       mov     \$-1,%rax
+       .byte   0xf3,0x0f,0xa6,0xc8     # rep xsha1
+       movaps  (%rsp),%xmm0
+       mov     16(%rsp),%eax
+       add     \$128+8,%rsp
+       movups  %xmm0,(%rdx)            # copy-out context
+       mov     %eax,16(%rdx)
+       ret
+.size  padlock_sha1_blocks,.-padlock_sha1_blocks
+
+# padlock_sha256_oneshot: "rep xsha256" with %rax=0.
+# The 32-byte context at the first argument (%xmm0 and %xmm1) is staged
+# at the bottom of a 128+8 byte stack frame (accessed with movaps), %rdi
+# is pointed at the copy, the third argument is moved to %rcx and the
+# second stays in %rsi. The context is copied back through %rdx, which
+# holds the original context pointer meanwhile.
+.globl padlock_sha256_oneshot
+.type  padlock_sha256_oneshot,\@function,3
+.align 16
+padlock_sha256_oneshot:
+       mov     %rdx,%rcx
+       mov     %rdi,%rdx               # put aside %rdi
+       movups  (%rdi),%xmm0            # copy-in context
+       sub     \$128+8,%rsp
+       movups  16(%rdi),%xmm1
+       movaps  %xmm0,(%rsp)
+       mov     %rsp,%rdi
+       movaps  %xmm1,16(%rsp)
+       xor     %rax,%rax
+       .byte   0xf3,0x0f,0xa6,0xd0     # rep xsha256
+       movaps  (%rsp),%xmm0
+       movaps  16(%rsp),%xmm1
+       add     \$128+8,%rsp
+       movups  %xmm0,(%rdx)            # copy-out context
+       movups  %xmm1,16(%rdx)
+       ret
+.size  padlock_sha256_oneshot,.-padlock_sha256_oneshot
+
+# padlock_sha256_blocks: "rep xsha256" with %rax=-1.
+# The 32-byte context at the first argument (%xmm0 and %xmm1) is staged
+# at the bottom of a 128+8 byte stack frame (accessed with movaps), %rdi
+# is pointed at the copy, the third argument is moved to %rcx and the
+# second stays in %rsi. The context is copied back through %rdx, which
+# holds the original context pointer meanwhile.
+.globl padlock_sha256_blocks
+.type  padlock_sha256_blocks,\@function,3
+.align 16
+padlock_sha256_blocks:
+       mov     %rdx,%rcx
+       mov     %rdi,%rdx               # put aside %rdi
+       movups  (%rdi),%xmm0            # copy-in context
+       sub     \$128+8,%rsp
+       movups  16(%rdi),%xmm1
+       movaps  %xmm0,(%rsp)
+       mov     %rsp,%rdi
+       movaps  %xmm1,16(%rsp)
+       mov     \$-1,%rax
+       .byte   0xf3,0x0f,0xa6,0xd0     # rep xsha256
+       movaps  (%rsp),%xmm0
+       movaps  16(%rsp),%xmm1
+       add     \$128+8,%rsp
+       movups  %xmm0,(%rdx)            # copy-out context
+       movups  %xmm1,16(%rdx)
+       ret
+.size  padlock_sha256_blocks,.-padlock_sha256_blocks
+
+# padlock_sha512_blocks: "rep xsha512".
+# The 64-byte context at the first argument (%xmm0 through %xmm3) is
+# staged at the bottom of a 128+8 byte stack frame (accessed with
+# movaps), %rdi is pointed at the copy, the third argument is moved to
+# %rcx and the second stays in %rsi. The context is copied back through
+# %rdx, which holds the original context pointer meanwhile.
+# NOTE(review): unlike the sha1/sha256 *_blocks routines, %rax is not
+# initialised before the instruction here -- confirm that xsha512 does
+# not take a mode value in %rax.
+.globl padlock_sha512_blocks
+.type  padlock_sha512_blocks,\@function,3
+.align 16
+padlock_sha512_blocks:
+       mov     %rdx,%rcx
+       mov     %rdi,%rdx               # put aside %rdi
+       movups  (%rdi),%xmm0            # copy-in context
+       sub     \$128+8,%rsp
+       movups  16(%rdi),%xmm1
+       movups  32(%rdi),%xmm2
+       movups  48(%rdi),%xmm3
+       movaps  %xmm0,(%rsp)
+       mov     %rsp,%rdi
+       movaps  %xmm1,16(%rsp)
+       movaps  %xmm2,32(%rsp)
+       movaps  %xmm3,48(%rsp)
+       .byte   0xf3,0x0f,0xa6,0xe0     # rep xsha512
+       movaps  (%rsp),%xmm0
+       movaps  16(%rsp),%xmm1
+       movaps  32(%rsp),%xmm2
+       movaps  48(%rsp),%xmm3
+       add     \$128+8,%rsp
+       movups  %xmm0,(%rdx)            # copy-out context
+       movups  %xmm1,16(%rdx)
+       movups  %xmm2,32(%rdx)
+       movups  %xmm3,48(%rdx)
+       ret
+.size  padlock_sha512_blocks,.-padlock_sha512_blocks
+___
+
+# generate_mode($mode,$opcode): append padlock_${mode}_encrypt to $code.
+#   $mode   - mode name; selects the symbol and label names, the
+#             %PADLOCK_MARGIN entry and the ctr32-only fragments
+#   $opcode - last byte of the raw "rep xcrypt*" encoding
+# The emitted routine returns 0 in %eax when ctx or len is not a multiple
+# of 16 and 1 otherwise. It has three paths:
+#   short   - len at or below the mode's margin: the input is first
+#             copied to the stack, then handled by the main loop
+#   aligned - align bit set in the control word, or both buffers 16-byte
+#             aligned: a single "rep xcrypt*" over the whole length
+#   loop    - otherwise: at most PADLOCK_CHUNK bytes per pass, bouncing
+#             misaligned data through a stack buffer that is zeroed
+#             before returning
+sub generate_mode {
+my ($mode,$opcode) = @_;
+# int padlock_$mode_encrypt(void *out, const void *inp,
+#              struct padlock_cipher_data *ctx, size_t len);
+$code.=<<___;
+.globl padlock_${mode}_encrypt
+.type  padlock_${mode}_encrypt,\@function,4
+.align 16
+padlock_${mode}_encrypt:
+       push    %rbp
+       push    %rbx
+
+       xor     %eax,%eax
+       test    \$15,$ctx
+       jnz     .L${mode}_abort
+       test    \$15,$len
+       jnz     .L${mode}_abort
+       lea     .Lpadlock_saved_context(%rip),%rax
+       pushf
+       cld
+       call    _padlock_verify_ctx
+       lea     16($ctx),$ctx           # control word
+       xor     %eax,%eax
+       xor     %ebx,%ebx
+___
+# Formally speaking correct condition is $len<=$margin and $inp+$margin
+# crosses page boundary [and next page is unreadable]. But $inp can
+# be unaligned in which case data can be copied to $out if latter is
+# aligned, in which case $out+$margin has to be checked. Covering all
+# cases appears more complicated than just copying short input...
+$code.=<<___   if ($PADLOCK_MARGIN{$mode});
+       cmp     \$$PADLOCK_MARGIN{$mode},$len
+       jbe     .L${mode}_short
+___
+$code.=<<___;
+       testl   \$`1<<5`,($ctx)         # align bit in control word
+       jnz     .L${mode}_aligned
+       test    \$0x0f,$out
+       setz    %al                     # !out_misaligned
+       test    \$0x0f,$inp
+       setz    %bl                     # !inp_misaligned
+       test    %ebx,%eax
+       jnz     .L${mode}_aligned
+       neg     %rax
+       mov     \$$PADLOCK_CHUNK,$chunk
+       not     %rax                    # out_misaligned?-1:0
+       lea     (%rsp),%rbp
+       cmp     $chunk,$len
+       cmovc   $len,$chunk             # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
+       and     $chunk,%rax             # out_misaligned?chunk:0
+       mov     $len,$chunk
+       neg     %rax
+       and     \$$PADLOCK_CHUNK-1,$chunk       # chunk%=PADLOCK_CHUNK
+       lea     (%rax,%rbp),%rsp
+___
+$code.=<<___                           if ($mode eq "ctr32");
+.L${mode}_reenter:
+       mov     -4($ctx),%eax           # pull 32-bit counter
+       bswap   %eax
+       neg     %eax
+       and     \$`$PADLOCK_CHUNK/16-1`,%eax
+       jz      .L${mode}_loop
+       shl     \$4,%eax
+       cmp     %rax,$len
+       cmova   %rax,$chunk             # don't let counter cross PADLOCK_CHUNK
+___
+$code.=<<___;
+       jmp     .L${mode}_loop
+.align 16
+.L${mode}_loop:
+       cmp     $len,$chunk             # ctr32 artefact
+       cmova   $len,$chunk             # ctr32 artefact
+       mov     $out,%r8                # save parameters
+       mov     $inp,%r9
+       mov     $len,%r10
+       mov     $chunk,$len
+       mov     $chunk,%r11
+       test    \$0x0f,$out             # out_misaligned
+       cmovnz  %rsp,$out
+       test    \$0x0f,$inp             # inp_misaligned
+       jz      .L${mode}_inp_aligned
+       shr     \$3,$len
+       .byte   0xf3,0x48,0xa5          # rep movsq
+       sub     $chunk,$out
+       mov     $chunk,$len
+       mov     $out,$inp
+.L${mode}_inp_aligned:
+       lea     -16($ctx),%rax          # ivp
+       lea     16($ctx),%rbx           # key
+       shr     \$4,$len
+       .byte   0xf3,0x0f,0xa7,$opcode  # rep xcrypt*
+___
+$code.=<<___                           if ($mode !~ /ecb|ctr/);
+       movdqa  (%rax),%xmm0
+       movdqa  %xmm0,-16($ctx)         # copy [or refresh] iv
+___
+$code.=<<___                           if ($mode eq "ctr32");
+       mov     -4($ctx),%eax           # pull 32-bit counter
+       test    \$0xffff0000,%eax
+       jnz     .L${mode}_no_corr
+       bswap   %eax
+       add     \$0x10000,%eax
+       bswap   %eax
+       mov     %eax,-4($ctx)
+.L${mode}_no_corr:
+___
+$code.=<<___;
+       mov     %r8,$out                # restore parameters
+       mov     %r11,$chunk
+       test    \$0x0f,$out
+       jz      .L${mode}_out_aligned
+       mov     $chunk,$len
+       shr     \$3,$len
+       lea     (%rsp),$inp
+       .byte   0xf3,0x48,0xa5          # rep movsq
+       sub     $chunk,$out
+.L${mode}_out_aligned:
+       mov     %r9,$inp
+       mov     %r10,$len
+       add     $chunk,$out
+       add     $chunk,$inp
+       sub     $chunk,$len
+       mov     \$$PADLOCK_CHUNK,$chunk
+       jnz     .L${mode}_loop
+
+       cmp     %rsp,%rbp
+       je      .L${mode}_done
+
+       pxor    %xmm0,%xmm0
+       lea     (%rsp),%rax
+.L${mode}_bzero:
+       movaps  %xmm0,(%rax)
+       lea     16(%rax),%rax
+       cmp     %rax,%rbp
+       ja      .L${mode}_bzero
+
+.L${mode}_done:
+       lea     (%rbp),%rsp
+       jmp     .L${mode}_exit
+___
+$code.=<<___ if ($PADLOCK_MARGIN{$mode});
+.align 16
+.L${mode}_short:
+       mov     %rsp,%rbp
+       sub     $len,%rsp
+       xor     $chunk,$chunk
+.L${mode}_short_copy:
+       movups  ($inp,$chunk),%xmm0
+       lea     16($chunk),$chunk
+       cmp     $chunk,$len
+       movaps  %xmm0,-16(%rsp,$chunk)
+       ja      .L${mode}_short_copy
+       mov     %rsp,$inp
+       mov     $len,$chunk
+       jmp     .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
+___
+$code.=<<___;
+.align 16
+.L${mode}_aligned:
+___
+$code.=<<___                           if ($mode eq "ctr32");
+       mov     -4($ctx),%eax           # pull 32-bit counter
+       mov     \$`16*0x10000`,$chunk
+       bswap   %eax
+       cmp     $len,$chunk
+       cmova   $len,$chunk
+       neg     %eax
+       and     \$0xffff,%eax
+       jz      .L${mode}_aligned_loop
+       shl     \$4,%eax
+       cmp     %rax,$len
+       cmova   %rax,$chunk             # don't let counter cross 2^16
+       jmp     .L${mode}_aligned_loop
+.align 16
+.L${mode}_aligned_loop:
+       cmp     $len,$chunk
+       cmova   $len,$chunk
+       mov     $len,%r10               # save parameters
+       mov     $chunk,$len
+       mov     $chunk,%r11
+___
+$code.=<<___;
+       lea     -16($ctx),%rax          # ivp
+       lea     16($ctx),%rbx           # key
+       shr     \$4,$len                # len/=AES_BLOCK_SIZE
+       .byte   0xf3,0x0f,0xa7,$opcode  # rep xcrypt*
+___
+$code.=<<___                           if ($mode !~ /ecb|ctr/);
+       movdqa  (%rax),%xmm0
+       movdqa  %xmm0,-16($ctx)         # copy [or refresh] iv
+___
+$code.=<<___                           if ($mode eq "ctr32");
+       mov     -4($ctx),%eax           # pull 32-bit counter
+       bswap   %eax
+       add     \$0x10000,%eax
+       bswap   %eax
+       mov     %eax,-4($ctx)
+
+       mov     %r11,$chunk             # restore parameters
+       mov     %r10,$len
+       sub     $chunk,$len
+       mov     \$`16*0x10000`,$chunk
+       jnz     .L${mode}_aligned_loop
+___
+$code.=<<___;
+.L${mode}_exit:
+       mov     \$1,%eax
+       lea     8(%rsp),%rsp
+.L${mode}_abort:
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
+___
+}
+
+&generate_mode("ecb",0xc8);
+&generate_mode("cbc",0xd0);
+#&generate_mode("cfb",0xe0);
+#&generate_mode("ofb",0xe8);
+#&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
+
+$code.=<<___;
+.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <address@hidden>"
+.align 16
+.data
+.align 8
+.Lpadlock_saved_context:
+       .quad   0
+___
+# Evaluate every backtick-quoted span left in the template as a Perl
+# expression and substitute its value.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+# STDOUT is a pipe to the translator, so close is where a failed write or
+# a failing translator is reported; do not let that pass unnoticed.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl
new file mode 100644
index 0000000..1b9adfb
--- /dev/null
+++ b/devel/perlasm/ghash-x86.pl
@@ -0,0 +1,1342 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <address@hidden> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. Former will be executed on
+# 486 and Pentium, latter on all others. MMX GHASH features so called
+# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
+# of per-key storage [+512 bytes shared table]. Performance results
+# are for streamed GHASH subroutine and are expressed in cycles per
+# processed byte, less is better:
+#
+#              gcc 2.95.3(*)   MMX assembler   x86 assembler
+#
+# Pentium      105/111(**)     -               50
+# PIII         68 /75          12.2            24
+# P4           125/125         17.8            84(***)
+# Opteron      66 /70          10.1            30
+# Core2        54 /67          8.4             18
+#
+# (*)  gcc 3.4.x was observed to generate few percent slower code,
+#      which is one of reasons why 2.95.3 results were chosen,
+#      another reason is lack of 3.4.x results for older CPUs;
+#      comparison with MMX results is not completely fair, because C
+#      results are for vanilla "256B" implementation, while
+#      assembler results are for "528B";-)
+# (**) second number is result for code compiled with -fPIC flag,
+#      which is actually more relevant, because assembler code is
+#      position-independent;
+# (***) see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
+# particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close is it to theoretical limit? The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation let's have closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be between
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because larger register
+# bank allows to interleave reduction and multiplication better.
+#
+# Does it make sense to increase Naggr? To start with it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise improvement has to be weighed against slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <address@hidden> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on same XMM register, PCLMULQDQ subroutine was measured to process
+# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+# similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where original 64-bit code processes one byte in 1.95 cycles.
+
+# Locate the directory this script lives in (everything up to the last
+# '/' or '\' of $0) and add it, plus its ../../perlasm sibling, to the
+# module search path so that x86asm.pl can be found.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+# $x86only is true when the last command-line argument is "386"; it is
+# also passed to asm_init as its third argument.
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+# $sse2 is set when any argument contains -DOPENSSL_IA32_SSE2; it gates
+# generation of the PCLMULQDQ code further down.
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+# Register names for the four 32-bit words of the accumulator Z and for
+# the input and table pointers used by the integer and MMX code paths.
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp  = "edi";
+$Htbl = "esi";
+
+$unroll = 0;   # Affects x86 loop. Folded loop performs ~7% worse
+               # than unrolled, which has to be weighed against
+               # 2.5x x86-specific code size reduction.
+
+# x86_loop(off): emit the integer-only inner loop of the "4-bit"
+# multiplication.
+#
+# Expectations of the emitted code:
+#   $Zll      byte offset of the first $Htbl entry to load (the callers
+#             compute it from the low nibble of the last input byte);
+#   $Htbl     base address of the 16-byte-per-entry table;
+#   off(%esp)     the 16-byte block being multiplied;
+#   off+16(%esp)  the 16 x 32-bit rem_4bit table (see deposit_rem_4bit).
+# On exit the four Z registers hold the byte-swapped result. %eax is
+# used as scratch ($rem). In the folded (!$unroll) variant $inp is
+# reused as the byte counter, so callers must reload it afterwards.
+# Both variants perform 31 nibble steps after the initial table load.
+sub x86_loop {
+    my $off = shift;
+    my $rem = "eax";
+
+       &mov    ($Zhh,&DWP(4,$Htbl,$Zll));
+       &mov    ($Zhl,&DWP(0,$Htbl,$Zll));
+       &mov    ($Zlh,&DWP(12,$Htbl,$Zll));
+       &mov    ($Zll,&DWP(8,$Htbl,$Zll));
+       &xor    ($rem,$rem);    # avoid partial register stalls on PIII
+
+       # shrd practically kills P4, 2.5x deterioration, but P4 has
+       # MMX code-path to execute. shrd runs tad faster [than twice
+       # the shifts, move's and or's] on pre-MMX Pentium (as well as
+       # PIII and Core2), *but* minimizes code size, spares register
+       # and thus allows to fold the loop...
+       if (!$unroll) {
+       # Folded variant: the loop body is emitted once and contains two
+       # nibble steps. The first (odd $i) uses the high nibble of byte
+       # $cnt and then decrements $cnt, leaving the loop once it goes
+       # negative; the second uses the low nibble of the next byte.
+       my $cnt = $inp;
+       &mov    ($cnt,15);
+       &jmp    (&label("x86_loop"));
+       &set_label("x86_loop",16);
+           for($i=1;$i<=2;$i++) {
+               # Z >>= 4 across the four words; the nibble shifted out
+               # selects the rem_4bit entry folded into the top word
+               &mov    (&LB($rem),&LB($Zll));
+               &shrd   ($Zll,$Zlh,4);
+               &and    (&LB($rem),0xf);
+               &shrd   ($Zlh,$Zhl,4);
+               &shrd   ($Zhl,$Zhh,4);
+               &shr    ($Zhh,4);
+               &xor    ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+               # next nibble, scaled by 16 to index $Htbl
+               &mov    (&LB($rem),&BP($off,"esp",$cnt));
+               if ($i&1) {
+                       &and    (&LB($rem),0xf0);
+               } else {
+                       &shl    (&LB($rem),4);
+               }
+
+               &xor    ($Zll,&DWP(8,$Htbl,$rem));
+               &xor    ($Zlh,&DWP(12,$Htbl,$rem));
+               &xor    ($Zhl,&DWP(0,$Htbl,$rem));
+               &xor    ($Zhh,&DWP(4,$Htbl,$rem));
+
+               if ($i&1) {
+                       &dec    ($cnt);
+                       &js     (&label("x86_break"));
+               } else {
+                       &jmp    (&label("x86_loop"));
+               }
+           }
+       &set_label("x86_break",16);
+       } else {
+           # Unrolled variant: the same 31 steps emitted back to back
+           # with constant byte offsets, so no counter register is used.
+           for($i=1;$i<32;$i++) {
+               &comment($i);
+               &mov    (&LB($rem),&LB($Zll));
+               &shrd   ($Zll,$Zlh,4);
+               &and    (&LB($rem),0xf);
+               &shrd   ($Zlh,$Zhl,4);
+               &shrd   ($Zhl,$Zhh,4);
+               &shr    ($Zhh,4);
+               &xor    ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+               if ($i&1) {
+                       &mov    (&LB($rem),&BP($off+15-($i>>1),"esp"));
+                       &and    (&LB($rem),0xf0);
+               } else {
+                       &mov    (&LB($rem),&BP($off+15-($i>>1),"esp"));
+                       &shl    (&LB($rem),4);
+               }
+
+               &xor    ($Zll,&DWP(8,$Htbl,$rem));
+               &xor    ($Zlh,&DWP(12,$Htbl,$rem));
+               &xor    ($Zhl,&DWP(0,$Htbl,$rem));
+               &xor    ($Zhh,&DWP(4,$Htbl,$rem));
+           }
+       }
+       &bswap  ($Zll);
+       &bswap  ($Zlh);
+       &bswap  ($Zhl);
+       if (!$x86only) {
+               &bswap  ($Zhh);
+       } else {
+               # 386-only build: the swap of $Zhh is routed through %eax
+               &mov    ("eax",$Zhh);
+               &bswap  ("eax");
+               &mov    ($Zhh,"eax");
+       }
+}
+
+# With the unrolled loop the body is emitted once as a shared internal
+# subroutine that both public entry points call. The offset of 4 passed
+# to x86_loop presumably accounts for the return address pushed by the
+# call -- confirm against x86asm.pl conventions.
+if ($unroll) {
+    &function_begin_B("_x86_gmult_4bit_inner");
+       &x86_loop(4);
+       &ret    ();
+    &function_end_B("_x86_gmult_4bit_inner");
+}
+
+# deposit_rem_4bit(bias): emit sixteen 32-bit stores that lay the
+# rem_4bit reduction table down on the stack, starting at bias(%esp).
+# Entry i is written at bias+4*i with its 16-bit constant placed in the
+# upper half of the word.
+sub deposit_rem_4bit {
+    my $base = shift;
+    my @consts = (0x0000,0x1C20,0x3840,0x2460,
+                  0x7080,0x6CA0,0x48C0,0x54E0,
+                  0xE100,0xFD20,0xD940,0xC560,
+                  0x9180,0x8DA0,0xA9C0,0xB5E0);
+
+    for my $idx (0..$#consts) {
+       &mov    (&DWP($base+4*$idx,"esp"),$consts[$idx]<<16);
+    }
+}
+
+# In a 386-only build the integer routines carry the plain names;
+# otherwise they get an "_x86" suffix.
+$suffix = $x86only ? "" : "_x86";
+
+# gcm_gmult_4bit[_x86](Xi, Htable): multiply the 16-byte value at Xi by
+# the hash key represented by Htable and store the product back to Xi.
+# Stack frame: 0..15(%esp) copy of Xi, 16..79(%esp) rem_4bit table.
+&function_begin("gcm_gmult_4bit".$suffix);
+       &stack_push(16+4+1);                    # +1 for stack alignment
+       &mov    ($inp,&wparam(0));              # load Xi
+       &mov    ($Htbl,&wparam(1));             # load Htable
+
+       &mov    ($Zhh,&DWP(0,$inp));            # load Xi[16]
+       &mov    ($Zhl,&DWP(4,$inp));
+       &mov    ($Zlh,&DWP(8,$inp));
+       &mov    ($Zll,&DWP(12,$inp));
+
+       &deposit_rem_4bit(16);
+
+       &mov    (&DWP(0,"esp"),$Zhh);           # copy Xi[16] on stack
+       &mov    (&DWP(4,"esp"),$Zhl);
+       &mov    (&DWP(8,"esp"),$Zlh);
+       &mov    (&DWP(12,"esp"),$Zll);
+       # bits 24..27 of the last word are the low nibble of Xi[15];
+       # leave it multiplied by 16 as the first Htable byte offset
+       &shr    ($Zll,20);
+       &and    ($Zll,0xf0);
+
+       if ($unroll) {
+               &call   ("_x86_gmult_4bit_inner");
+       } else {
+               &x86_loop(0);
+               &mov    ($inp,&wparam(0));      # folded loop reused $inp as counter
+       }
+
+       &mov    (&DWP(12,$inp),$Zll);           # store result to Xi
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(0,$inp),$Zhh);
+       &stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+# gcm_ghash_4bit[_x86](Xi, Htable, in, len): for each 16-byte block of
+# the input, xor it into the running value and multiply by the hash key
+# (Htable); the final value is written back to Xi. The termination test
+# sits at the bottom of the loop, so the body runs at least once.
+# Stack frame: 0..15(%esp) current block, 16..79(%esp) rem_4bit table.
+&function_begin("gcm_ghash_4bit".$suffix);
+       &stack_push(16+4+1);                    # +1 for 64-bit alignment
+       &mov    ($Zll,&wparam(0));              # load Xi
+       &mov    ($Htbl,&wparam(1));             # load Htable
+       &mov    ($inp,&wparam(2));              # load in
+       &mov    ("ecx",&wparam(3));             # load len
+       &add    ("ecx",$inp);
+       &mov    (&wparam(3),"ecx");             # len slot now holds in+len
+
+       &mov    ($Zhh,&DWP(0,$Zll));            # load Xi[16]
+       &mov    ($Zhl,&DWP(4,$Zll));
+       &mov    ($Zlh,&DWP(8,$Zll));
+       &mov    ($Zll,&DWP(12,$Zll));
+
+       &deposit_rem_4bit(16);
+
+    &set_label("x86_outer_loop",16);
+       &xor    ($Zll,&DWP(12,$inp));           # xor with input
+       &xor    ($Zlh,&DWP(8,$inp));
+       &xor    ($Zhl,&DWP(4,$inp));
+       &xor    ($Zhh,&DWP(0,$inp));
+       &mov    (&DWP(12,"esp"),$Zll);          # dump it on stack
+       &mov    (&DWP(8,"esp"),$Zlh);
+       &mov    (&DWP(4,"esp"),$Zhl);
+       &mov    (&DWP(0,"esp"),$Zhh);
+
+       # low nibble of the block's last byte, times 16: first Htable offset
+       &shr    ($Zll,20);
+       &and    ($Zll,0xf0);
+
+       if ($unroll) {
+               &call   ("_x86_gmult_4bit_inner");
+       } else {
+               &x86_loop(0);
+               &mov    ($inp,&wparam(2));      # folded loop reused $inp as counter
+       }
+       &lea    ($inp,&DWP(16,$inp));           # advance to next block
+       &cmp    ($inp,&wparam(3));              # reached in+len?
+       &mov    (&wparam(2),$inp)       if (!$unroll);
+       &jb     (&label("x86_outer_loop"));
+
+       &mov    ($inp,&wparam(0));      # load Xi
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(0,$inp),$Zhh);
+       &stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (0) {{      # "May" MMX version is kept for reference...
+
+$S=12;         # shift factor for rem_4bit
+
+# Internal, fully unrolled MMX multiplication shared by the "May"
+# entry points below (this whole branch is disabled by the enclosing
+# if (0)). Register contract of the emitted code:
+#   in:  $Htbl = Htable, $inp = address of the 16-byte block,
+#        %eax = address of rem_4bit, $Zll = last byte of the block;
+#   out: $Zhh,$Zhl,$Zlh,$Zll = byte-swapped product;
+#   $inp is used as scratch at the end, so callers reload it.
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words effort is considered to be well
+# spent... Since initial release the loop was unrolled in order to
+# "liberate" register previously used as loop counter. Instead it's
+# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves move of Z.lo from MMX to integer register,
+# effective address calculation and finally merge of value to Z.hi.
+# Reference to rem_4bit is scheduled so late that I had to >>4
+# rem_4bit elements. This resulted in 20-45% improvement
+# on contemporary µ-archs.
+{
+    my $cnt;
+    my $rem_4bit = "eax";
+    my @rem = ($Zhh,$Zll);     # two remainder registers, used alternately
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+       &xor    ($nlo,$nlo);    # avoid partial register stalls on PIII
+       &mov    ($nhi,$Zll);
+       &mov    (&LB($nlo),&LB($nhi));
+       &shl    (&LB($nlo),4);
+       &and    ($nhi,0xf0);
+       &movq   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &movq   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &movd   ($rem[0],$Zlo);
+
+       # 31 nibble steps; even $cnt values >= 0 also fetch input byte
+       # $cnt/2 for the following two steps
+       for ($cnt=28;$cnt>=-2;$cnt--) {
+           my $odd = $cnt&1;
+           my $nix = $odd ? $nlo : $nhi;
+
+               &shl    (&LB($nlo),4)                   if ($odd);
+               &psrlq  ($Zlo,4);
+               &movq   ($tmp,$Zhi);
+               &psrlq  ($Zhi,4);
+               &pxor   ($Zlo,&QWP(8,$Htbl,$nix));
+               &mov    (&LB($nlo),&BP($cnt/2,$inp))    if (!$odd && $cnt>=0);
+               &psllq  ($tmp,60);
+               &and    ($nhi,0xf0)                     if ($odd);
+               &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+               &and    ($rem[0],0xf);
+               &pxor   ($Zhi,&QWP(0,$Htbl,$nix));
+               &mov    ($nhi,$nlo)                     if (!$odd && $cnt>=0);
+               &movd   ($rem[1],$Zlo);
+               &pxor   ($Zlo,$tmp);
+
+               push    (@rem,shift(@rem));             # "rotate" registers
+       }
+
+       &mov    ($inp,&DWP(4,$rem_4bit,$rem[1],8));     # last rem_4bit[rem]
+
+       &psrlq  ($Zlo,32);      # lower part of Zlo is already there
+       &movd   ($Zhl,$Zhi);
+       &psrlq  ($Zhi,32);
+       &movd   ($Zlh,$Zlo);
+       &movd   ($Zhh,$Zhi);
+       &shl    ($inp,4);       # compensate for rem_4bit[i] being >>4
+
+       &bswap  ($Zll);
+       &bswap  ($Zhl);
+       &bswap  ($Zlh);
+       &xor    ($Zhh,$inp);
+       &bswap  ($Zhh);
+
+       &ret    ();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+# "May" gcm_gmult_4bit_mmx(Xi, Htable): single multiplication of Xi by
+# the hash key, delegating to _mmx_gmult_4bit_inner.
+&function_begin("gcm_gmult_4bit_mmx");
+       &mov    ($inp,&wparam(0));      # load Xi
+       &mov    ($Htbl,&wparam(1));     # load Htable
+
+       # position-independent address of rem_4bit: call/pop yields the
+       # address of pic_point, lea adds the label distance
+       &call   (&label("pic_point"));
+       &set_label("pic_point");
+       &blindpop("eax");
+       &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+       &movz   ($Zll,&BP(15,$inp));    # last byte of Xi
+
+       &call   ("_mmx_gmult_4bit_inner");
+
+       &mov    ($inp,&wparam(0));      # load Xi
+       &emms   ();
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# "May" gcm_ghash_4bit_mmx(Xi, Htable, in, len): for each 16-byte input
+# block, xor it into the running value, spill the result to a 16-byte
+# stack buffer and multiply it via _mmx_gmult_4bit_inner; the final
+# value is stored to Xi. The loop test is at the bottom, so the body
+# runs at least once.
+#
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+       &mov    ($Zhh,&wparam(0));      # load Xi
+       &mov    ($Htbl,&wparam(1));     # load Htable
+       &mov    ($inp,&wparam(2));      # load in
+       &mov    ($Zlh,&wparam(3));      # load len
+
+       # position-independent address of rem_4bit in %eax
+       &call   (&label("pic_point"));
+       &set_label("pic_point");
+       &blindpop("eax");
+       &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+       &add    ($Zlh,$inp);
+       &mov    (&wparam(3),$Zlh);      # len to point at the end of input
+       &stack_push(4+1);               # +1 for stack alignment
+
+       &mov    ($Zll,&DWP(12,$Zhh));   # load Xi[16]
+       &mov    ($Zhl,&DWP(4,$Zhh));
+       &mov    ($Zlh,&DWP(8,$Zhh));
+       &mov    ($Zhh,&DWP(0,$Zhh));
+       &jmp    (&label("mmx_outer_loop"));
+
+    &set_label("mmx_outer_loop",16);
+       &xor    ($Zll,&DWP(12,$inp));   # xor with input
+       &xor    ($Zhl,&DWP(4,$inp));
+       &xor    ($Zlh,&DWP(8,$inp));
+       &xor    ($Zhh,&DWP(0,$inp));
+       &mov    (&wparam(2),$inp);      # remember current input position
+       &mov    (&DWP(12,"esp"),$Zll);  # spill block to the stack buffer
+       &mov    (&DWP(4,"esp"),$Zhl);
+       &mov    (&DWP(8,"esp"),$Zlh);
+       &mov    (&DWP(0,"esp"),$Zhh);
+
+       &mov    ($inp,"esp");           # inner routine reads the stack copy
+       &shr    ($Zll,24);              # last byte of the block
+
+       &call   ("_mmx_gmult_4bit_inner");
+
+       &mov    ($inp,&wparam(2));
+       &lea    ($inp,&DWP(16,$inp));   # advance to next block
+       &cmp    ($inp,&wparam(3));
+       &jb     (&label("mmx_outer_loop"));
+
+       &mov    ($inp,&wparam(0));      # load Xi
+       &emms   ();
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(0,$inp),$Zhh);
+
+       &stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{     # "June" MMX version...
+               # ... has slower "April" gcm_gmult_4bit_mmx with folded
+               # loop. This is done to conserve code size...
+$S=16;         # shift factor for rem_4bit
+
+# mmx_loop(inp, rem_4bit): emit the folded MMX loop of the "4-bit"
+# multiplication.
+#   inp       name of the register pointing at the 16-byte block;
+#   rem_4bit  name of the register holding the rem_4bit table address.
+# The emitted code expects $Zll to hold the last byte of the block and
+# $Htbl to point at Htable; it then walks bytes 14 down to 0 using $Zhh
+# as the counter. On exit $Zhh,$Zhl,$Zlh,$Zll hold the byte-swapped
+# product; the inp and rem_4bit registers are not written.
+#
+# Declared without a prototype: the routine takes two arguments, and an
+# empty "()" prototype would only work for the &-style call form.
+sub mmx_loop {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words effort is considered to be well
+# spent...
+    my $inp = shift;
+    my $rem_4bit = shift;
+    my $cnt = $Zhh;
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+    my $rem = $Zll;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+       &xor    ($nlo,$nlo);    # avoid partial register stalls on PIII
+       &mov    ($nhi,$Zll);
+       &mov    (&LB($nlo),&LB($nhi));
+       &mov    ($cnt,14);
+       &shl    (&LB($nlo),4);
+       &and    ($nhi,0xf0);
+       &movq   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &movq   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &movd   ($rem,$Zlo);
+       &jmp    (&label("mmx_loop"));
+
+    # first half of the loop body: step with the pending high nibble
+    # and fetch the next input byte; leaves once the counter is negative
+    &set_label("mmx_loop",16);
+       &psrlq  ($Zlo,4);
+       &and    ($rem,0xf);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nhi));
+       &mov    (&LB($nlo),&BP(0,$inp,$cnt));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &dec    ($cnt);
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nhi));
+       &mov    ($nhi,$nlo);
+       &pxor   ($Zlo,$tmp);
+       &js     (&label("mmx_break"));
+
+    # second half: step with the low nibble of the byte just fetched
+       &shl    (&LB($nlo),4);
+       &and    ($rem,0xf);
+       &psrlq  ($Zlo,4);
+       &and    ($nhi,0xf0);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &pxor   ($Zlo,$tmp);
+       &jmp    (&label("mmx_loop"));
+
+    # tail: the two steps for the final byte, without further loads
+    &set_label("mmx_break",16);
+       &shl    (&LB($nlo),4);
+       &and    ($rem,0xf);
+       &psrlq  ($Zlo,4);
+       &and    ($nhi,0xf0);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &pxor   ($Zlo,$tmp);
+
+       &psrlq  ($Zlo,4);
+       &and    ($rem,0xf);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nhi));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nhi));
+       &pxor   ($Zlo,$tmp);
+
+       &psrlq  ($Zlo,32);      # lower part of Zlo is already there
+       &movd   ($Zhl,$Zhi);
+       &psrlq  ($Zhi,32);
+       &movd   ($Zlh,$Zlo);
+       &movd   ($Zhh,$Zhi);
+
+       &bswap  ($Zll);
+       &bswap  ($Zhl);
+       &bswap  ($Zlh);
+       &bswap  ($Zhh);
+}
+
+# gcm_gmult_4bit_mmx(Xi, Htable): single multiplication of Xi by the
+# hash key using the folded MMX loop emitted inline by mmx_loop. $inp
+# is not touched by that loop, so Xi is not reloaded before the stores.
+&function_begin("gcm_gmult_4bit_mmx");
+       &mov    ($inp,&wparam(0));      # load Xi
+       &mov    ($Htbl,&wparam(1));     # load Htable
+
+       # position-independent address of rem_4bit: call/pop yields the
+       # address of pic_point, lea adds the label distance
+       &call   (&label("pic_point"));
+       &set_label("pic_point");
+       &blindpop("eax");
+       &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+       &movz   ($Zll,&BP(15,$inp));    # last byte of Xi
+
+       &mmx_loop($inp,"eax");
+
+       &emms   ();
+       &mov    (&DWP(12,$inp),$Zll);   # store result to Xi
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
+# (see gcm128.c for details). It provides further 20-40% performance
+# improvement over above mentioned "May" version.
+
+&static_label("rem_8bit");
+
+# gcm_ghash_4bit_mmx(Xi, Htable, inp, len): streamed GHASH over the
+# 16-byte blocks in [inp, inp+len); the result is written back to Xi.
+# The loop exit test is an equality check against inp+len at the bottom
+# of the loop, so the body runs at least once and len is presumably
+# expected to be a non-zero multiple of 16 -- confirm with callers.
+#
+# Stack frame built below (offsets from the aligned %esp):
+#     0.. 15  (u8)(Htable[i].lo<<4), one byte per entry
+#    16..143  Htable[i] low quad-words
+#   144..271  Htable[i] high quad-words
+#   272..399  Htable[i]>>4 low quad-words
+#   400..527  Htable[i]>>4 high quad-words
+#   528..543  current inp^Xi block (last dword stays in a register)
+#   544       Xi pointer      548  current inp
+#   552       inp+len         556  caller's %esp
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+  my $rem_8bit = "esi";
+  my $Htbl = "ebx";
+
+    # parameter block
+    &mov       ("eax",&wparam(0));             # Xi
+    &mov       ("ebx",&wparam(1));             # Htable
+    &mov       ("ecx",&wparam(2));             # inp
+    &mov       ("edx",&wparam(3));             # len
+    &mov       ("ebp","esp");                  # original %esp
+    &call      (&label("pic_point"));
+    &set_label ("pic_point");
+    &blindpop  ($rem_8bit);
+    &lea       ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+    &sub       ("esp",512+16+16);              # allocate stack frame...
+    &and       ("esp",-64);                    # ...and align it
+    &sub       ("esp",16);                     # place for (u8)(H[]<<4)
+
+    &add       ("edx","ecx");                  # pointer to the end of input
+    &mov       (&DWP(528+16+0,"esp"),"eax");   # save Xi
+    &mov       (&DWP(528+16+8,"esp"),"edx");   # save inp+len
+    &mov       (&DWP(528+16+12,"esp"),"ebp");  # save original %esp
+
+    { my @lo  = ("mm0","mm1","mm2");
+      my @hi  = ("mm3","mm4","mm5");
+      my @tmp = ("mm6","mm7");
+      # all three are lexical to this block; the main loop below sets
+      # its own $i before use
+      my ($off1,$off2,$i)=(0,0);
+
+      &add     ($Htbl,128);                    # optimize for size
+      &lea     ("edi",&DWP(16+128,"esp"));
+      &lea     ("ebp",&DWP(16+256+128,"esp"));
+
+      # decompose Htable (low and high parts are kept separately),
+      # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+      # The loop is software-pipelined over three register sets, hence
+      # 18 iterations for 16 entries and the per-line guards on $i.
+      for ($i=0;$i<18;$i++) {
+
+       &mov    ("edx",&DWP(16*$i+8-128,$Htbl))         if ($i<16);
+       &movq   ($lo[0],&QWP(16*$i+8-128,$Htbl))        if ($i<16);
+       &psllq  ($tmp[1],60)                            if ($i>1);
+       &movq   ($hi[0],&QWP(16*$i+0-128,$Htbl))        if ($i<16);
+       &por    ($lo[2],$tmp[1])                        if ($i>1);
+       &movq   (&QWP($off1-128,"edi"),$lo[1])          if ($i>0 && $i<17);
+       &psrlq  ($lo[1],4)                              if ($i>0 && $i<17);
+       &movq   (&QWP($off1,"edi"),$hi[1])              if ($i>0 && $i<17);
+       &movq   ($tmp[0],$hi[1])                        if ($i>0 && $i<17);
+       &movq   (&QWP($off2-128,"ebp"),$lo[2])          if ($i>1);
+       &psrlq  ($hi[1],4)                              if ($i>0 && $i<17);
+       &movq   (&QWP($off2,"ebp"),$hi[2])              if ($i>1);
+       &shl    ("edx",4)                               if ($i<16);
+       &mov    (&BP($i,"esp"),&LB("edx"))              if ($i<16);
+
+       unshift (@lo,pop(@lo));                 # "rotate" registers
+       unshift (@hi,pop(@hi));
+       unshift (@tmp,pop(@tmp));
+       $off1 += 8      if ($i>0);
+       $off2 += 8      if ($i>1);
+      }
+    }
+
+    &movq      ($Zhi,&QWP(0,"eax"));
+    &mov       ("ebx",&DWP(8,"eax"));
+    &mov       ("edx",&DWP(12,"eax"));         # load Xi
+
+&set_label("outer",16);
+  { my $nlo = "eax";
+    my $dat = "edx";
+    my @nhi = ("edi","ebp");
+    my @rem = ("ebx","ecx");
+    my @red = ("mm0","mm1","mm2");
+    my $tmp = "mm3";
+
+    &xor       ($dat,&DWP(12,"ecx"));          # merge input data
+    &xor       ("ebx",&DWP(8,"ecx"));
+    &pxor      ($Zhi,&QWP(0,"ecx"));
+    &lea       ("ecx",&DWP(16,"ecx"));         # inp+=16
+    #&mov      (&DWP(528+12,"esp"),$dat);      # save inp^Xi
+    &mov       (&DWP(528+8,"esp"),"ebx");
+    &movq      (&QWP(528+0,"esp"),$Zhi);
+    &mov       (&DWP(528+16+4,"esp"),"ecx");   # save inp
+
+    &xor       ($nlo,$nlo);
+    &rol       ($dat,8);
+    &mov       (&LB($nlo),&LB($dat));
+    &mov       ($nhi[1],$nlo);
+    &and       (&LB($nlo),0x0f);
+    &shr       ($nhi[1],4);
+    &pxor      ($red[0],$red[0]);
+    &rol       ($dat,8);                       # next byte
+    &pxor      ($red[1],$red[1]);
+    &pxor      ($red[2],$red[2]);
+
+    # Just like in "May" version modulo-schedule for critical path in
+    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
+    # is scheduled so late that rem_8bit[] has to be shifted *right*
+    # by 16, which is why last argument to pinsrw is 2, which
+    # corresponds to <<32=<<48>>16...
+    # $j tracks the byte position: whenever it reaches a multiple of 4
+    # the next dword of the saved block is reloaded into $dat.
+    for ($j=11,$i=0;$i<15;$i++) {
+
+      if ($i>0) {
+       &pxor   ($Zlo,&QWP(16,"esp",$nlo,8));           # Z^=H[nlo]
+       &rol    ($dat,8);                               # next byte
+       &pxor   ($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+       &pxor   ($Zlo,$tmp);
+       &pxor   ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+       &xor    (&LB($rem[1]),&BP(0,"esp",$nhi[0]));    # rem^(H[nhi]<<4)
+      } else {
+       &movq   ($Zlo,&QWP(16,"esp",$nlo,8));
+       &movq   ($Zhi,&QWP(16+128,"esp",$nlo,8));
+      }
+
+       &mov    (&LB($nlo),&LB($dat));
+       &mov    ($dat,&DWP(528+$j,"esp"))               if (--$j%4==0);
+
+       &movd   ($rem[0],$Zlo);
+       &movz   ($rem[1],&LB($rem[1]))                  if ($i>0);
+       &psrlq  ($Zlo,8);                               # Z>>=8
+
+       &movq   ($tmp,$Zhi);
+       &mov    ($nhi[0],$nlo);
+       &psrlq  ($Zhi,8);
+
+       &pxor   ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8));  # Z^=H[nhi]>>4
+       &and    (&LB($nlo),0x0f);
+       &psllq  ($tmp,56);
+
+       &pxor   ($Zhi,$red[1])                          if ($i>1);
+       &shr    ($nhi[0],4);
+       &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2)  if ($i>0);
+
+       unshift (@red,pop(@red));                       # "rotate" registers
+       unshift (@rem,pop(@rem));
+       unshift (@nhi,pop(@nhi));
+    }
+
+    &pxor      ($Zlo,&QWP(16,"esp",$nlo,8));           # Z^=H[nlo]
+    &pxor      ($Zhi,&QWP(16+128,"esp",$nlo,8));
+    &xor       (&LB($rem[1]),&BP(0,"esp",$nhi[0]));    # rem^(H[nhi]<<4)
+
+    &pxor      ($Zlo,$tmp);
+    &pxor      ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+    &movz      ($rem[1],&LB($rem[1]));
+
+    &pxor      ($red[2],$red[2]);                      # clear 2nd word
+    &psllq     ($red[1],4);
+
+    &movd      ($rem[0],$Zlo);
+    &psrlq     ($Zlo,4);                               # Z>>=4
+
+    &movq      ($tmp,$Zhi);
+    &psrlq     ($Zhi,4);
+    &shl       ($rem[0],4);                            # rem<<4
+
+    &pxor      ($Zlo,&QWP(16,"esp",$nhi[1],8));        # Z^=H[nhi]
+    &psllq     ($tmp,60);
+    &movz      ($rem[0],&LB($rem[0]));
+
+    &pxor      ($Zlo,$tmp);
+    &pxor      ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+    &pinsrw    ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+    &pxor      ($Zhi,$red[1]);
+
+    &movd      ($dat,$Zlo);
+    &pinsrw    ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
+
+    &psllq     ($red[0],12);                           # correct by <<16>>4
+    &pxor      ($Zhi,$red[0]);
+    &psrlq     ($Zlo,32);
+    &pxor      ($Zhi,$red[2]);
+
+    &mov       ("ecx",&DWP(528+16+4,"esp"));   # restore inp
+    &movd      ("ebx",$Zlo);
+    &movq      ($tmp,$Zhi);                    # 01234567
+    &psllw     ($Zhi,8);                       # 1.3.5.7.
+    &psrlw     ($tmp,8);                       # .0.2.4.6
+    &por       ($Zhi,$tmp);                    # 10325476
+    &bswap     ($dat);
+    &pshufw    ($Zhi,$Zhi,0b00011011);         # 76543210
+    &bswap     ("ebx");
+
+    &cmp       ("ecx",&DWP(528+16+8,"esp"));   # are we done?
+    &jne       (&label("outer"));
+  }
+
+    &mov       ("eax",&DWP(528+16+0,"esp"));   # restore Xi
+    &mov       (&DWP(12,"eax"),"edx");
+    &mov       (&DWP(8,"eax"),"ebx");
+    &movq      (&QWP(0,"eax"),$Zhi);
+
+    &mov       ("esp",&DWP(528+16+12,"esp"));  # restore original %esp
+    &emms      ();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+# Register names used by the PCLMULQDQ code. Note that $Htbl and $inp
+# are reassigned here to registers different from the ones the integer
+# and MMX code was generated with.
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+# XMM register roles: $Xhi:$Xi is the double-width value being worked
+# on, $T1..$T3 are temporaries, $Xn/$Xhn a second value pair.
+($Xi,$Xhi)=("xmm0","xmm1");    $Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+# clmul64x64_T2(Xhi, Xi, Hkey): emit a Karatsuba carry-less
+# multiplication of the 128-bit values in Xi and Hkey using three
+# pclmulqdq instructions (low halves, high halves, and the xor of the
+# halves). The 256-bit product is left in Xhi:Xi. Arguments are register
+# names; the emitted code clobbers the global temporaries $T1 and $T2.
+sub clmul64x64_T2 {    # minimal "register" pressure
+my ($Xhi,$Xi,$Hkey)=@_;
+
+       &movdqa         ($Xhi,$Xi);             #
+       &pshufd         ($T1,$Xi,0b01001110);   # swap 64-bit halves
+       &pshufd         ($T2,$Hkey,0b01001110);
+       &pxor           ($T1,$Xi);              #
+       &pxor           ($T2,$Hkey);
+
+       &pclmulqdq      ($Xi,$Hkey,0x00);       #######
+       &pclmulqdq      ($Xhi,$Hkey,0x11);      #######
+       &pclmulqdq      ($T1,$T2,0x00);         #######
+       &xorps          ($T1,$Xi);              #
+       &xorps          ($T1,$Xhi);             #
+
+       # fold the middle term into the two halves of the result
+       &movdqa         ($T2,$T1);              #
+       &psrldq         ($T1,8);
+       &pslldq         ($T2,8);                #
+       &pxor           ($Xhi,$T1);
+       &pxor           ($Xi,$T2);              #
+}
+
+# clmul64x64_T3(Xhi, Xi, Hkey): same Karatsuba carry-less multiplication
+# as clmul64x64_T2 (256-bit product left in Xhi:Xi), but scheduled with
+# three global temporaries: the emitted code clobbers $T1, $T2 and $T3.
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+       &movdqa         ($T1,$Xi);              #
+       &movdqa         ($Xhi,$Xi);
+       &pclmulqdq      ($Xi,$Hkey,0x00);       #######
+       &pclmulqdq      ($Xhi,$Hkey,0x11);      #######
+       &pshufd         ($T2,$T1,0b01001110);   #
+       &pshufd         ($T3,$Hkey,0b01001110);
+       &pxor           ($T2,$T1);              #
+       &pxor           ($T3,$Hkey);
+       &pclmulqdq      ($T2,$T3,0x00);         #######
+       &pxor           ($T2,$Xi);              #
+       &pxor           ($T2,$Xhi);             #
+
+       # fold the middle term into the two halves of the result
+       &movdqa         ($T3,$T2);              #
+       &psrldq         ($T2,8);
+       &pslldq         ($T3,8);                #
+       &pxor           ($Xhi,$T2);
+       &pxor           ($Xi,$T3);              #
+}
+
+if (1) {               # Algorithm 9 with <<1 twist.
+                       # Reduction is shorter and uses only two
+                       # temporary registers, which makes it better
+                       # candidate for interleaving with 64x64
+                       # multiplication. Pre-modulo-scheduled loop
+                       # was found to be ~20% faster than Algorithm 5
+                       # below. Algorithm 9 was therefore chosen for
+                       # further optimization...
+
+# reduction_alg9($Xhi,$Xi)
+# Emits the two-phase reduction of the double-width value $Xhi:$Xi; the
+# reduced result is left in $Xi.  Clobbers $T1, $T2 and modifies $Xhi.
+sub reduction_alg9 {   # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+       # 1st phase
+       # (statement terminator added: without it this call and the next
+       # one were parsed as a single bitwise-AND expression)
+       &movdqa         ($T1,$Xi);              #
+       &psllq          ($Xi,1);
+       &pxor           ($Xi,$T1);              #
+       &psllq          ($Xi,5);                #
+       &pxor           ($Xi,$T1);              #
+       &psllq          ($Xi,57);               #
+       &movdqa         ($T2,$Xi);              #
+       &pslldq         ($Xi,8);
+       &psrldq         ($T2,8);                #
+       &pxor           ($Xi,$T1);
+       &pxor           ($Xhi,$T2);             #
+
+       # 2nd phase
+       &movdqa         ($T2,$Xi);
+       &psrlq          ($Xi,5);
+       &pxor           ($Xi,$T2);              #
+       &psrlq          ($Xi,1);                #
+       &pxor           ($Xi,$T2);              #
+       &pxor           ($T2,$Xhi);
+       &psrlq          ($Xi,1);                #
+       &pxor           ($Xi,$T2);              #
+}
+
+# gcm_init_clmul(Htable, H): wparam(0) is the table to fill, wparam(1) the
+# 16-byte key material.  Stores the "<<1 twisted" H at Htable+0 and its
+# square at Htable+16.
+&function_begin_B("gcm_init_clmul");
+       &mov            ($Htbl,&wparam(0));
+       &mov            ($Xip,&wparam(1));
+
+       # Position-independent address of the "bswap" constants: the call
+       # pushes the address of "pic", which is fetched into $const and then
+       # offset by (bswap - pic).
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Hkey,&QWP(0,$Xip));
+       &pshufd         ($Hkey,$Hkey,0b01001110);# dword swap
+
+       # <<1 twist
+       &pshufd         ($T2,$Hkey,0b11111111); # broadcast uppermost dword
+       &movdqa         ($T1,$Hkey);
+       &psllq          ($Hkey,1);
+       &pxor           ($T3,$T3);              #
+       &psrlq          ($T1,63);
+       &pcmpgtd        ($T3,$T2);              # broadcast carry bit
+       &pslldq         ($T1,8);
+       &por            ($Hkey,$T1);            # H<<=1
+
+       # magic reduction
+       &pand           ($T3,&QWP(16,$const));  # 0x1c2_polynomial
+       &pxor           ($Hkey,$T3);            # if(carry) H^=0x1c2_polynomial
+
+       # calculate H^2
+       &movdqa         ($Xi,$Hkey);
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+
+       &movdqu         (&QWP(0,$Htbl),$Hkey);  # save H
+       &movdqu         (&QWP(16,$Htbl),$Xi);   # save H^2
+
+       &ret            ();
+&function_end_B("gcm_init_clmul");
+
+# gcm_gmult_clmul(Xi, Htable): wparam(0) points at the 16-byte Xi, wparam(1)
+# at the key table.  Replaces Xi in place with the reduced product Xi*H.
+&function_begin_B("gcm_gmult_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+
+       # $const = run-time address of the "bswap" constants (PIC idiom)
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($T3,&QWP(0,$const));   # byte-reversal mask
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
+       &pshufb         ($Xi,$T3);              # reverse byte order of Xi
+
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+
+       &pshufb         ($Xi,$T3);              # back to memory byte order
+       &movdqu         (&QWP(0,$Xip),$Xi);
+
+       &ret    ();
+&function_end_B("gcm_gmult_clmul");
+
+# gcm_ghash_clmul(Xi, Htable, inp, len): arguments are fetched through
+# wparam(0..3).  Folds the input into Xi two blocks per iteration using H
+# and H^2 from Htable; in "mod_loop" the reduction of one pair is
+# interleaved with the multiplication for the next.  A single remaining
+# block is handled at "odd_tail".
+# NOTE(review): len looks like it must be a non-zero multiple of 16 bytes;
+# confirm against callers.
+&function_begin("gcm_ghash_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+       &mov            ($inp,&wparam(2));
+       &mov            ($len,&wparam(3));
+
+       # $const = run-time address of the "bswap" constants (PIC idiom)
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($T3,&QWP(0,$const));   # byte-reversal mask
+       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+       &pshufb         ($Xi,$T3);
+
+       &sub            ($len,0x10);
+       &jz             (&label("odd_tail"));   # exactly one block of input
+
+       #######
+       # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+       #       [(H*Ii+1) + (H*Xi+1)] mod P =
+       #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+       #
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T2  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+       &movups         ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+       &lea            ($inp,&DWP(32,$inp));   # i+=2
+       &sub            ($len,0x20);
+       &jbe            (&label("even_tail"));
+
+&set_label("mod_loop");
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+
+       &movdqa         ($T3,$Xn);              #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+       &movdqa         ($Xhn,$Xn);
+        &pxor          ($Xhi,$T1);             # "Ii+Xi", consume early
+
+         &movdqa       ($T1,$Xi);              #&reduction_alg9($Xhi,$Xi); 1st phase
+         &psllq        ($Xi,1);
+         &pxor         ($Xi,$T1);              #
+         &psllq        ($Xi,5);                #
+         &pxor         ($Xi,$T1);              #
+       &pclmulqdq      ($Xn,$Hkey,0x00);       #######
+         &psllq        ($Xi,57);               #
+         &movdqa       ($T2,$Xi);              #
+         &pslldq       ($Xi,8);
+         &psrldq       ($T2,8);                #
+         &pxor         ($Xi,$T1);
+       &pshufd         ($T1,$T3,0b01001110);
+         &pxor         ($Xhi,$T2);             #
+       &pxor           ($T1,$T3);
+       &pshufd         ($T3,$Hkey,0b01001110);
+       &pxor           ($T3,$Hkey);            #
+
+       &pclmulqdq      ($Xhn,$Hkey,0x11);      #######
+         &movdqa       ($T2,$Xi);              # 2nd phase
+         &psrlq        ($Xi,5);
+         &pxor         ($Xi,$T2);              #
+         &psrlq        ($Xi,1);                #
+         &pxor         ($Xi,$T2);              #
+         &pxor         ($T2,$Xhi);
+         &psrlq        ($Xi,1);                #
+         &pxor         ($Xi,$T2);              #
+
+       &pclmulqdq      ($T1,$T3,0x00);         #######
+       &movups         ($Hkey,&QWP(16,$Htbl)); # load H^2
+       &xorps          ($T1,$Xn);              #
+       &xorps          ($T1,$Xhn);             #
+
+       &movdqa         ($T3,$T1);              #
+       &psrldq         ($T1,8);
+       &pslldq         ($T3,8);                #
+       &pxor           ($Xhn,$T1);
+       &pxor           ($Xn,$T3);              #
+       &movdqa         ($T3,&QWP(0,$const));   # $T3 was used as scratch: reload byte-reversal mask
+
+       &lea            ($inp,&DWP(32,$inp));
+       &sub            ($len,0x20);
+       &ja             (&label("mod_loop"));
+
+&set_label("even_tail");
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &reduction_alg9 ($Xhi,$Xi);
+
+       &test           ($len,$len);
+       &jnz            (&label("done"));
+
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
+&set_label("odd_tail");
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &pshufb         ($T1,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
+       &reduction_alg9 ($Xhi,$Xi);
+
+&set_label("done");
+       &pshufb         ($Xi,$T3);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else {               # Algorithm 5. Kept for reference purposes.
+
+# reduction_alg5($Xhi,$Xi)
+# Emits a one-bit left shift of the double-width value $Xhi:$Xi followed by
+# a two-phase reduction; the result is left in $Xi.  Clobbers $T1, $T2,
+# $T3 and modifies $Xhi.
+sub reduction_alg5 {   # 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+       # <<1
+       &movdqa         ($T1,$Xi);              #
+       &movdqa         ($T2,$Xhi);
+       &pslld          ($Xi,1);
+       &pslld          ($Xhi,1);               #
+       &psrld          ($T1,31);
+       &psrld          ($T2,31);               #
+       &movdqa         ($T3,$T1);
+       &pslldq         ($T1,4);
+       &psrldq         ($T3,12);               #
+       &pslldq         ($T2,4);
+       &por            ($Xhi,$T3);             #
+       &por            ($Xi,$T1);
+       &por            ($Xhi,$T2);             #
+
+       # 1st phase
+       &movdqa         ($T1,$Xi);
+       &movdqa         ($T2,$Xi);
+       &movdqa         ($T3,$Xi);              #
+       &pslld          ($T1,31);
+       &pslld          ($T2,30);
+       &pslld          ($Xi,25);               #
+       &pxor           ($T1,$T2);
+       &pxor           ($T1,$Xi);              #
+       &movdqa         ($T2,$T1);              #
+       &pslldq         ($T1,12);
+       &psrldq         ($T2,4);                #
+       &pxor           ($T3,$T1);
+
+       # 2nd phase
+       &pxor           ($Xhi,$T3);             #
+       &movdqa         ($Xi,$T3);
+       &movdqa         ($T1,$T3);
+       &psrld          ($Xi,1);                #
+       &psrld          ($T1,2);
+       &psrld          ($T3,7);                #
+       &pxor           ($Xi,$T1);
+       &pxor           ($Xhi,$T2);
+       &pxor           ($Xi,$T3);              #
+       &pxor           ($Xi,$Xhi);             #
+}
+
+# Algorithm 5 variant of gcm_init_clmul(Htable, H).  It sits in the else
+# arm of the enclosing "if (1)", so it is not emitted as the file stands.
+# Stores the dword-swapped H at Htable+0 and its square at Htable+16.
+&function_begin_B("gcm_init_clmul");
+       &mov            ($Htbl,&wparam(0));
+       &mov            ($Xip,&wparam(1));
+
+       # $const = run-time address of the "bswap" constants (PIC idiom)
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Hkey,&QWP(0,$Xip));
+       &pshufd         ($Hkey,$Hkey,0b01001110);# dword swap
+
+       # calculate H^2
+       &movdqa         ($Xi,$Hkey);
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &movdqu         (&QWP(0,$Htbl),$Hkey);  # save H
+       &movdqu         (&QWP(16,$Htbl),$Xi);   # save H^2
+
+       &ret            ();
+&function_end_B("gcm_init_clmul");
+
+# Algorithm 5 variant of gcm_gmult_clmul(Xi, Htable); not emitted while the
+# enclosing condition is "if (1)".  Replaces Xi in place with Xi*H.
+&function_begin_B("gcm_gmult_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+
+       # $const = run-time address of the "bswap" constants (PIC idiom)
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($Xn,&QWP(0,$const));   # byte-reversal mask kept in $Xn ($T3 is clobbered below)
+       &movdqu         ($Hkey,&QWP(0,$Htbl));
+       &pshufb         ($Xi,$Xn);
+
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &pshufb         ($Xi,$Xn);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+
+       &ret    ();
+&function_end_B("gcm_gmult_clmul");
+
+# Algorithm 5 variant of gcm_ghash_clmul(Xi, Htable, inp, len); not emitted
+# while the enclosing condition is "if (1)".  Processes two blocks per
+# iteration without interleaving.  Because clmul64x64_T3/reduction_alg5
+# clobber $T3, the byte-reversal mask is reloaded from $const after each use.
+&function_begin("gcm_ghash_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+       &mov            ($inp,&wparam(2));
+       &mov            ($len,&wparam(3));
+
+       # $const = run-time address of the "bswap" constants (PIC idiom)
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($T3,&QWP(0,$const));
+       &movdqu         ($Hkey,&QWP(0,$Htbl));
+       &pshufb         ($Xi,$T3);
+
+       &sub            ($len,0x10);
+       &jz             (&label("odd_tail"));   # exactly one block of input
+
+       #######
+       # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+       #       [(H*Ii+1) + (H*Xi+1)] mod P =
+       #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+       #
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T3  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+       &movdqu         ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+       &sub            ($len,0x20);
+       &lea            ($inp,&DWP(32,$inp));   # i+=2
+       &jbe            (&label("even_tail"));
+
+&set_label("mod_loop");
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &reduction_alg5 ($Xhi,$Xi);
+
+       #######
+       &movdqa         ($T3,&QWP(0,$const));
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T3  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+       &movdqu         ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+       &sub            ($len,0x20);
+       &lea            ($inp,&DWP(32,$inp));
+       &ja             (&label("mod_loop"));
+
+&set_label("even_tail");
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &movdqa         ($T3,&QWP(0,$const));
+       &test           ($len,$len);
+       &jnz            (&label("done"));
+
+       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+&set_label("odd_tail");
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &pshufb         ($T1,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &movdqa         ($T3,&QWP(0,$const));
+&set_label("done");
+       &pshufb         ($Xi,$T3);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+       &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);      # offset 0: pshufb mask that reverses the 16 bytes
+       &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
+}}     # $sse2
+
+&set_label("rem_4bit",64);
+       &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+       &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+       &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+       &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+&set_label("rem_8bit",64);
+       &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+       &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+       &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+       &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+       &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+       &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+       &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+       &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+       &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+       &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+       &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+       &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+       &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+       &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+       &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+       &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+       &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+       &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+       &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+       &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+       &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+       &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+       &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+       &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+       &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+       &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+       &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+       &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+       &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+       &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+       &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+       &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}}    # !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <address@hidden>");
+&asm_finish();
+
+# A question was raised about choice of vanilla MMX. Or rather why wasn't
+# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
+# CPUs such as PIII, "4-bit" MMX version was observed to provide better
+# performance than *corresponding* SSE2 one even on contemporary CPUs.
+# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
+# implementation featuring full range of lookup-table sizes, but with
+# per-invocation lookup table setup. Latter means that table size is
+# chosen depending on how much data is to be hashed in every given call,
+# more data - larger table. Best reported result for Core2 is ~4 cycles
+# per processed byte out of 64KB block. This number accounts even for
+# 64KB table setup overhead. As discussed in gcm128.c we choose to be
+# more conservative in respect to lookup table sizes, but how do the
+# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+# on same platform. As also discussed in gcm128.c, next in line "8-bit
+# Shoup's" or "4KB" method should deliver twice the performance of
+# "256B" one, in other words not worse than ~6 cycles per byte. It
+# should also be noted that in SSE2 case improvement can be "super-
+# linear," i.e. more than twice, mostly because >>8 maps to single
+# instruction on SSE2 register. This is unlike "4-bit" case when >>4
+# maps to same amount of instructions in both MMX and SSE2 cases.
+# Bottom line is that switch to SSE2 is considered to be justifiable
+# only in case we choose to implement "8-bit" method...
diff --git a/devel/perlasm/ghash-x86_64.pl b/devel/perlasm/ghash-x86_64.pl
new file mode 100644
index 0000000..a5ae180
--- /dev/null
+++ b/devel/perlasm/ghash-x86_64.pl
@@ -0,0 +1,805 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <address@hidden> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+#              gcc 3.4.x(*)    assembler
+#
+# P4           28.6            14.0            +100%
+# Opteron      19.3            7.7             +150%
+# Core2                17.8            8.1(**)         +120%
+#
+# (*)  comparison is not completely fair, because C results are
+#      for vanilla "256B" implementation, while assembler results
+#      are for "528B";-)
+# (**) it's mystery [to me] why Core2 result is not same as for
+#      Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <address@hidden> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# Command line: [flavour] output.  If the first argument contains a dot it
+# is taken to be the output file and no flavour is set.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 calling convention is selected by flavour or by an .asm output name.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate the x86_64-xlate.pl translator relative to this script.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator; fail
+# loudly if the pipe cannot be started instead of writing output nowhere.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+# common register layout
+$nlo="%rax";           # table index built from the low nibble of the current byte
+$nhi="%rbx";           # table index built from the high nibble of the current byte
+$Zlo="%r8";            # low 64 bits of the accumulator
+$Zhi="%r9";            # high 64 bits of the accumulator
+$tmp="%r10";           # carries bits shifted out of $Zhi into $Zlo
+$rem_4bit = "%r11";    # address of the .Lrem_4bit table
+
+$Xi="%rdi";            # pointer to the 16-byte Xi
+$Htbl="%rsi";          # pointer to the per-key table
+
+# per-function register layout
+$cnt="%rcx";           # byte counter used by loop()
+$rem="%rdx";           # index into the remainder table
+
+# LB(reg): return the AT&T name of the low-byte sub-register of a 32/64-bit
+# register, e.g. %rax -> %al, %esi -> %sil, %r10d -> %r10b.  A name that
+# matches none of the patterns is returned unchanged.  Callers use the
+# &LB(...) form, which bypasses the empty prototype.
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/  or
+                       $r =~ s/%[er]([sd]i)/%${1}l/    or
+                       $r =~ s/%[er](bp)/%${1}l/       or
+                       $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; }
+
+# AUTOLOAD thunk [simplified] for 32-bit style perlasm: any undefined sub
+# called as &mnemonic(dst,...,src) appends one instruction line to $code.
+# Operands are written in reverse order and a purely numeric last operand
+# is turned into a "$"-prefixed immediate.
+sub AUTOLOAD()
+{ (my $mnemonic = $AUTOLOAD) =~ s/.*:://;      # strip the package qualifier
+  my $src = pop;
+  if ($src*1 eq $src) { $src = "\$$src"; }
+  my @operands = ($src, reverse @_);
+  $code .= "\t$mnemonic\t" . join(',',@operands) . "\n";
+}
+
+# loop($inp): append to $code one 4-bit table-driven multiplication pass.
+# The emitted code expects byte 15 of the input already in $Zlo, then walks
+# the bytes at $inp from offset 14 down to 0, consuming one nibble per step
+# through lookups in $Htbl and in the table addressed by $rem_4bit.  It
+# finishes with the byte-swapped result in $Zlo/$Zhi.
+# $N is bumped on every call so the .Loop/.Lbreak labels stay unique.
+{ my $N;
+  sub loop() {
+  my $inp = shift;
+
+       $N++;
+$code.=<<___;
+       xor     $nlo,$nlo
+       xor     $nhi,$nhi
+       mov     `&LB("$Zlo")`,`&LB("$nlo")`
+       mov     `&LB("$Zlo")`,`&LB("$nhi")`
+       shl     \$4,`&LB("$nlo")`
+       mov     \$14,$cnt
+       mov     8($Htbl,$nlo),$Zlo
+       mov     ($Htbl,$nlo),$Zhi
+       and     \$0xf0,`&LB("$nhi")`
+       mov     $Zlo,$rem
+       jmp     .Loop$N
+
+.align 16
+.Loop$N:
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       mov     ($inp,$cnt),`&LB("$nlo")`
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nhi),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nhi),$Zhi
+       mov     `&LB("$nlo")`,`&LB("$nhi")`
+       xor     ($rem_4bit,$rem,8),$Zhi
+       mov     $Zlo,$rem
+       shl     \$4,`&LB("$nlo")`
+       xor     $tmp,$Zlo
+       dec     $cnt
+       js      .Lbreak$N
+
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nlo),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nlo),$Zhi
+       and     \$0xf0,`&LB("$nhi")`
+       xor     ($rem_4bit,$rem,8),$Zhi
+       mov     $Zlo,$rem
+       xor     $tmp,$Zlo
+       jmp     .Loop$N
+
+.align 16
+.Lbreak$N:
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nlo),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nlo),$Zhi
+       and     \$0xf0,`&LB("$nhi")`
+       xor     ($rem_4bit,$rem,8),$Zhi
+       mov     $Zlo,$rem
+       xor     $tmp,$Zlo
+
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nhi),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nhi),$Zhi
+       xor     $tmp,$Zlo
+       xor     ($rem_4bit,$rem,8),$Zhi
+
+       bswap   $Zlo
+       bswap   $Zhi
+___
+}}
+
+# gcm_gmult_4bit(Xi, Htable): multiplies the 16-byte value at $Xi by the key
+# via the 4-bit table at $Htbl and writes the result back to $Xi.  The body
+# is produced by loop(); the trailing ",2" on .type is the argument count
+# passed on to the translator.
+$code=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.type  gcm_gmult_4bit,\@function,2
+.align 16
+gcm_gmult_4bit:
+       push    %rbx
+       push    %rbp            # %rbp and %r12 are pushed exclusively in
+       push    %r12            # order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+       movzb   15($Xi),$Zlo
+       lea     .Lrem_4bit(%rip),$rem_4bit
+___
+       &loop   ($Xi);
+# Store the result, reload %rbx and drop the three pushed slots.
+$code.=<<___;
+       mov     $Zlo,8($Xi)
+       mov     $Zhi,($Xi)
+
+       mov     16(%rsp),%rbx
+       lea     24(%rsp),%rsp
+.Lgmult_epilogue:
+       ret
+.size  gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;   # same register, loaded with .Lrem_8bit in this function
+
+# Entry of gcm_ghash_4bit: save six callee-saved registers, reserve 280
+# bytes of stack and move the 3rd/4th arguments to %r14/%r15.  The
+# trailing ",4" on .type is the argument count passed on to the translator.
+$code.=<<___;
+.globl gcm_ghash_4bit
+.type  gcm_ghash_4bit,\@function,4
+.align 16
+gcm_ghash_4bit:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$280,%rsp
+.Lghash_prologue:
+       mov     $inp,%r14               # reassign couple of args
+       mov     $len,%r15
+___
+# Body of gcm_ghash_4bit.  First an unrolled setup loop copies the 16
+# entries of $Htbl, shifted right by 4 bits, into a stack table addressed
+# through $Hshr4, together with one byte per entry at 0..15(%rsp) holding
+# the 4 bits shifted out (moved to the high nibble).  Then .Louter_loop
+# consumes one 16-byte input block per iteration until $inp reaches $len.
+{ my $inp="%r14";
+  my $dat="%edx";
+  my $len="%r15";
+  my @nhi=("%ebx","%ecx");
+  my @rem=("%r12","%r13");
+  my $Hshr4="%rbp";
+
+       # +128 bias keeps the 16*$i-128 displacements below within -128..127
+       &sub    ($Htbl,-128);           # size optimization
+       &lea    ($Hshr4,"16+128(%rsp)");
+       { my @lo =($nlo,$nhi);
+          my @hi =($Zlo,$Zhi);
+
+         &xor  ($dat,$dat);
+         # Software-pipelined over two register pairs: iteration $i loads
+         # entry $i, shifts entry $i-1 and stores entry $i-2 (index $j).
+         for ($i=0,$j=-2;$i<18;$i++,$j++) {
+           &mov        ("$j(%rsp)",&LB($dat))          if ($i>1);
+           &or         ($lo[0],$tmp)                   if ($i>1);
+           &mov        (&LB($dat),&LB($lo[1]))         if ($i>0 && $i<17);
+           &shr        ($lo[1],4)                      if ($i>0 && $i<17);
+           &mov        ($tmp,$hi[1])                   if ($i>0 && $i<17);
+           &shr        ($hi[1],4)                      if ($i>0 && $i<17);
+           &mov        ("8*$j($Hshr4)",$hi[0])         if ($i>1);
+           &mov        ($hi[0],"16*$i+0-128($Htbl)")   if ($i<16);
+           &shl        (&LB($dat),4)                   if ($i>0 && $i<17);
+           &mov        ("8*$j-128($Hshr4)",$lo[0])     if ($i>1);
+           &mov        ($lo[0],"16*$i+8-128($Htbl)")   if ($i<16);
+           &shl        ($tmp,60)                       if ($i>0 && $i<17);
+
+           push        (@lo,shift(@lo));
+           push        (@hi,shift(@hi));
+         }
+       }
+       &add    ($Htbl,-128);           # undo the +128 bias
+       &mov    ($Zlo,"8($Xi)");
+       &mov    ($Zhi,"0($Xi)");
+       &add    ($len,$inp);            # pointer to the end of data
+       &lea    ($rem_8bit,".Lrem_8bit(%rip)");
+       &jmp    (".Louter_loop");
+
+$code.=".align 16\n.Louter_loop:\n";
+       # Xi ^= input block; the xored value is written back to ($Xi) so the
+       # unrolled loop below can re-read it 32 bits at a time into $dat.
+       &xor    ($Zhi,"($inp)");
+       &mov    ("%rdx","8($inp)");
+       &lea    ($inp,"16($inp)");
+       &xor    ("%rdx",$Zlo);
+       &mov    ("($Xi)",$Zhi);
+       &mov    ("8($Xi)","%rdx");
+       &shr    ("%rdx",32);
+
+       &xor    ($nlo,$nlo);
+       &rol    ($dat,8);
+       &mov    (&LB($nlo),&LB($dat));
+       &movz   ($nhi[0],&LB($dat));
+       &shl    (&LB($nlo),4);
+       &shr    ($nhi[0],4);
+
+       # 15 unrolled steps; @nhi and @rem alternate between two registers
+       # so consecutive steps can overlap.
+       for ($j=11,$i=0;$i<15;$i++) {
+           &rol        ($dat,8);
+           &xor        ($Zlo,"8($Htbl,$nlo)")                  if ($i>0);
+           &xor        ($Zhi,"($Htbl,$nlo)")                   if ($i>0);
+           &mov        ($Zlo,"8($Htbl,$nlo)")                  if ($i==0);
+           &mov        ($Zhi,"($Htbl,$nlo)")                   if ($i==0);
+
+           &mov        (&LB($nlo),&LB($dat));
+           &xor        ($Zlo,$tmp)                             if ($i>0);
+           &movzw      ($rem[1],"($rem_8bit,$rem[1],2)")       if ($i>0);
+
+           &movz       ($nhi[1],&LB($dat));
+           &shl        (&LB($nlo),4);
+           &movzb      ($rem[0],"(%rsp,$nhi[0])");
+
+           &shr        ($nhi[1],4)                             if ($i<14);
+           &and        ($nhi[1],0xf0)                          if ($i==14);
+           &shl        ($rem[1],48)                            if ($i>0);
+           &xor        ($rem[0],$Zlo);
+
+           &mov        ($tmp,$Zhi);
+           &xor        ($Zhi,$rem[1])                          if ($i>0);
+           &shr        ($Zlo,8);
+
+           &movz       ($rem[0],&LB($rem[0]));
+           &mov        ($dat,"$j($Xi)")                        if (--$j%4==0);  # next 32 bits of Xi
+           &shr        ($Zhi,8);
+
+           &xor        ($Zlo,"-128($Hshr4,$nhi[0],8)");
+           &shl        ($tmp,56);
+           &xor        ($Zhi,"($Hshr4,$nhi[0],8)");
+
+           unshift     (@nhi,pop(@nhi));               # "rotate" registers
+           unshift     (@rem,pop(@rem));
+       }
+       # Final step: a 4-bit shift replaces the 8-bit shift of the loop above.
+       &movzw  ($rem[1],"($rem_8bit,$rem[1],2)");
+       &xor    ($Zlo,"8($Htbl,$nlo)");
+       &xor    ($Zhi,"($Htbl,$nlo)");
+
+       &shl    ($rem[1],48);
+       &xor    ($Zlo,$tmp);
+
+       &xor    ($Zhi,$rem[1]);
+       &movz   ($rem[0],&LB($Zlo));
+       &shr    ($Zlo,4);
+
+       &mov    ($tmp,$Zhi);
+       &shl    (&LB($rem[0]),4);
+       &shr    ($Zhi,4);
+
+       &xor    ($Zlo,"8($Htbl,$nhi[0])");
+       &movzw  ($rem[0],"($rem_8bit,$rem[0],2)");
+       &shl    ($tmp,60);
+
+       &xor    ($Zhi,"($Htbl,$nhi[0])");
+       &xor    ($Zlo,$tmp);
+       &shl    ($rem[0],48);
+
+       &bswap  ($Zlo);
+       &xor    ($Zhi,$rem[0]);
+
+       &bswap  ($Zhi);
+       &cmp    ($inp,$len);
+       &jb     (".Louter_loop");
+}
+# Exit of gcm_ghash_4bit: store the result to ($Xi), step over the 280-byte
+# local area, reload the six saved registers and release the frame.
+$code.=<<___;
+       mov     $Zlo,8($Xi)
+       mov     $Zhi,($Xi)
+
+       lea     280(%rsp),%rsi
+       mov     0(%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lghash_epilogue:
+       ret
+.size  gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+# Registers carrying the first four integer arguments, chosen by ABI.
+@_4args=$win64?        ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+               ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+
+# XMM register roles for the PCLMULQDQ code: $Xhi:$Xi holds the
+# double-width product, $T1..$T3 are scratch.
+($Xi,$Xhi)=("%xmm0","%xmm1");  $Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+# clmul64x64_T2($Xhi,$Xi,$Hkey[,$modulo])
+# Appends to $code a carry-less multiplication of $Xi by $Hkey built from
+# three PCLMULQDQs; the double-width product ends up in $Xhi:$Xi and $T1,
+# $T2 are clobbered.  When $modulo is defined the operand set-up (copy to
+# $Xhi and the XOR-folded halves in $T1/$T2) is skipped, so the caller must
+# already have emitted it.
+sub clmul64x64_T2 {    # minimal register pressure
+my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+
+$code.=<<___ if (!defined($modulo));
+       movdqa          $Xi,$Xhi                #
+       pshufd          \$0b01001110,$Xi,$T1
+       pshufd          \$0b01001110,$Hkey,$T2
+       pxor            $Xi,$T1                 #
+       pxor            $Hkey,$T2
+___
+$code.=<<___;
+       pclmulqdq       \$0x00,$Hkey,$Xi        #######
+       pclmulqdq       \$0x11,$Hkey,$Xhi       #######
+       pclmulqdq       \$0x00,$T2,$T1          #######
+       pxor            $Xi,$T1                 #
+       pxor            $Xhi,$T1                #
+
+       movdqa          $T1,$T2                 #
+       psrldq          \$8,$T1
+       pslldq          \$8,$T2                 #
+       pxor            $T1,$Xhi
+       pxor            $T2,$Xi                 #
+___
+}
+
+# reduction_alg9($Xhi,$Xi)
+# Appends to $code the two-phase reduction of the double-width value
+# $Xhi:$Xi; the reduced result is left in $Xi.  Clobbers $T1, $T2 and
+# modifies $Xhi.
+sub reduction_alg9 {   # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+       # 1st phase
+       movdqa          $Xi,$T1                 #
+       psllq           \$1,$Xi
+       pxor            $T1,$Xi                 #
+       psllq           \$5,$Xi                 #
+       pxor            $T1,$Xi                 #
+       psllq           \$57,$Xi                #
+       movdqa          $Xi,$T2                 #
+       pslldq          \$8,$Xi
+       psrldq          \$8,$T2                 #       
+       pxor            $T1,$Xi
+       pxor            $T2,$Xhi                #
+
+       # 2nd phase
+       movdqa          $Xi,$T2
+       psrlq           \$5,$Xi
+       pxor            $T2,$Xi                 #
+       psrlq           \$1,$Xi                 #
+       pxor            $T2,$Xi                 #
+       pxor            $Xhi,$T2
+       psrlq           \$1,$Xi                 #
+       pxor            $T2,$Xi                 #
+___
+}
+
+# gcm_init_clmul: first argument is the output table ($Htbl), second
+# argument points at the 16-byte hash key ($Xip).
+#
+# The generated routine loads the key, swaps its 64-bit halves, shifts it
+# left by one bit with conditional xor of .L0x1c2_polynomial, then stores
+# the resulting H at 0($Htbl) and H^2 at 16($Htbl).
+# Tagged abi-omnipotent because the argument registers are already chosen
+# per ABI through @_4args.
+{ my ($Htbl,$Xip)=@_4args;
+
+$code.=<<___;
+.globl gcm_init_clmul
+.type  gcm_init_clmul,\@abi-omnipotent
+.align 16
+gcm_init_clmul:
+       movdqu          ($Xip),$Hkey
+       pshufd          \$0b01001110,$Hkey,$Hkey        # dword swap
+
+       # <<1 twist
+       pshufd          \$0b11111111,$Hkey,$T2  # broadcast uppermost dword
+       movdqa          $Hkey,$T1
+       psllq           \$1,$Hkey
+       pxor            $T3,$T3                 #
+       psrlq           \$63,$T1
+       pcmpgtd         $T2,$T3                 # broadcast carry bit
+       pslldq          \$8,$T1
+       por             $T1,$Hkey               # H<<=1
+
+       # magic reduction
+       pand            .L0x1c2_polynomial(%rip),$T3
+       pxor            $T3,$Hkey               # if(carry) H^=0x1c2_polynomial
+
+       # calculate H^2
+       movdqa          $Hkey,$Xi
+___
+       # $Xi = H*H, reduced
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+       movdqu          $Hkey,($Htbl)           # save H
+       movdqu          $Xi,16($Htbl)           # save H^2
+       ret
+.size  gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+# gcm_gmult_clmul: first argument points at the 16-byte value Xi, which is
+# updated in place; second argument is the key table ($Htbl).
+#
+# The generated routine byte-reverses Xi with .Lbswap_mask, multiplies it
+# by the H stored at 0($Htbl), reduces, byte-reverses again and writes the
+# result back to ($Xip).
+# Tagged abi-omnipotent because the argument registers are already chosen
+# per ABI through @_4args.
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl gcm_gmult_clmul
+.type  gcm_gmult_clmul,\@abi-omnipotent
+.align 16
+gcm_gmult_clmul:
+       movdqu          ($Xip),$Xi
+       movdqa          .Lbswap_mask(%rip),$T3
+       movdqu          ($Htbl),$Hkey
+       pshufb          $T3,$Xi
+___
+       # $Xi = Xi*H, reduced
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+       pshufb          $T3,$Xi
+       movdqu          $Xi,($Xip)
+       ret
+.size  gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+# gcm_ghash_clmul: arguments are, in order, pointer to the 16-byte
+# accumulator Xi (updated in place), the key table ($Htbl, H at offset 0
+# and H^2 at offset 16), the input pointer and the input length.
+#
+# The main loop consumes two 16-byte blocks per iteration using the
+# aggregation described in the comment inside the routine, with the
+# reduction of the previous product interleaved with the multiplication of
+# the next block.  A single remaining block is handled at .Lodd_tail.
+# NOTE(review): the length is only ever decremented by 0x10/0x20 and
+# compared against zero, so it is presumably a non-zero multiple of 16
+# bytes -- confirm against the callers.
+#
+# On Win64 xmm6-xmm10 are saved in a 0x58-byte frame; the prologue is
+# hand-encoded so that its layout matches .LSEH_info_gcm_ghash_clmul.
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+  my $Xn="%xmm6";
+  my $Xhn="%xmm7";
+  my $Hkey2="%xmm8";
+  my $T1n="%xmm9";
+  my $T2n="%xmm10";
+
+$code.=<<___;
+.globl gcm_ghash_clmul
+.type  gcm_ghash_clmul,\@abi-omnipotent
+.align 16
+gcm_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_ghash_clmul:
+       # I can't trust assembler to use specific encoding:-(
+       .byte   0x48,0x83,0xec,0x58             #sub    \$0x58,%rsp
+       .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
+       .byte   0x0f,0x29,0x7c,0x24,0x10        #movaps %xmm7,0x10(%rsp)
+       .byte   0x44,0x0f,0x29,0x44,0x24,0x20   #movaps %xmm8,0x20(%rsp)
+       .byte   0x44,0x0f,0x29,0x4c,0x24,0x30   #movaps %xmm9,0x30(%rsp)
+       .byte   0x44,0x0f,0x29,0x54,0x24,0x40   #movaps %xmm10,0x40(%rsp)
+___
+$code.=<<___;
+       movdqa          .Lbswap_mask(%rip),$T3
+
+       movdqu          ($Xip),$Xi
+       movdqu          ($Htbl),$Hkey
+       pshufb          $T3,$Xi
+
+       sub             \$0x10,$len
+       jz              .Lodd_tail
+
+       movdqu          16($Htbl),$Hkey2
+       #######
+       # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+       #       [(H*Ii+1) + (H*Xi+1)] mod P =
+       #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+       #
+       movdqu          ($inp),$T1              # Ii
+       movdqu          16($inp),$Xn            # Ii+1
+       pshufb          $T3,$T1
+       pshufb          $T3,$Xn
+       pxor            $T1,$Xi                 # Ii+Xi
+___
+       &clmul64x64_T2  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+# preparation for the H^2 multiplication; clmul64x64_T2 is called with
+# $modulo set below and therefore does not emit it itself
+$code.=<<___;
+       movdqa          $Xi,$Xhi                #
+       pshufd          \$0b01001110,$Xi,$T1
+       pshufd          \$0b01001110,$Hkey2,$T2
+       pxor            $Xi,$T1                 #
+       pxor            $Hkey2,$T2
+
+       lea             32($inp),$inp           # i+=2
+       sub             \$0x20,$len
+       jbe             .Leven_tail
+
+.Lmod_loop:
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey2,1);    # H^2*(Ii+Xi)
+$code.=<<___;
+       movdqu          ($inp),$T1              # Ii
+       pxor            $Xn,$Xi                 # (H*Ii+1) + H^2*(Ii+Xi)
+       pxor            $Xhn,$Xhi
+
+       movdqu          16($inp),$Xn            # Ii+1
+       pshufb          $T3,$T1
+       pshufb          $T3,$Xn
+
+       movdqa          $Xn,$Xhn                #
+       pshufd          \$0b01001110,$Xn,$T1n
+       pshufd          \$0b01001110,$Hkey,$T2n
+       pxor            $Xn,$T1n                #
+       pxor            $Hkey,$T2n
+        pxor           $T1,$Xhi                # "Ii+Xi", consume early
+
+         movdqa        $Xi,$T1                 # 1st phase
+         psllq         \$1,$Xi
+         pxor          $T1,$Xi                 #
+         psllq         \$5,$Xi                 #
+         pxor          $T1,$Xi                 #
+       pclmulqdq       \$0x00,$Hkey,$Xn        #######
+         psllq         \$57,$Xi                #
+         movdqa        $Xi,$T2                 #
+         pslldq        \$8,$Xi
+         psrldq        \$8,$T2                 #       
+         pxor          $T1,$Xi
+         pxor          $T2,$Xhi                #
+
+       pclmulqdq       \$0x11,$Hkey,$Xhn       #######
+         movdqa        $Xi,$T2                 # 2nd phase
+         psrlq         \$5,$Xi
+         pxor          $T2,$Xi                 #
+         psrlq         \$1,$Xi                 #
+         pxor          $T2,$Xi                 #
+         pxor          $Xhi,$T2
+         psrlq         \$1,$Xi                 #
+         pxor          $T2,$Xi                 #
+
+       pclmulqdq       \$0x00,$T2n,$T1n        #######
+        movdqa         $Xi,$Xhi                #
+        pshufd         \$0b01001110,$Xi,$T1
+        pshufd         \$0b01001110,$Hkey2,$T2
+        pxor           $Xi,$T1                 #
+        pxor           $Hkey2,$T2
+
+       pxor            $Xn,$T1n                #
+       pxor            $Xhn,$T1n               #
+       movdqa          $T1n,$T2n               #
+       psrldq          \$8,$T1n
+       pslldq          \$8,$T2n                #
+       pxor            $T1n,$Xhn
+       pxor            $T2n,$Xn                #
+
+       lea             32($inp),$inp
+       sub             \$0x20,$len
+       ja              .Lmod_loop
+
+.Leven_tail:
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey2,1);    # H^2*(Ii+Xi)
+$code.=<<___;
+       pxor            $Xn,$Xi                 # (H*Ii+1) + H^2*(Ii+Xi)
+       pxor            $Xhn,$Xhi
+___
+       &reduction_alg9 ($Xhi,$Xi);
+# $len is zero here only when exactly one more block is left to process
+$code.=<<___;
+       test            $len,$len
+       jnz             .Ldone
+
+.Lodd_tail:
+       movdqu          ($inp),$T1              # Ii
+       pshufb          $T3,$T1
+       pxor            $T1,$Xi                 # Ii+Xi
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
+       &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+       pshufb          $T3,$Xi
+       movdqu          $Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       movaps  0x40(%rsp),%xmm10
+       add     \$0x58,%rsp
+___
+$code.=<<___;
+       ret
+.LSEH_end_gcm_ghash_clmul:
+.size  gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+# Read-only constants: the byte-reversal shuffle mask and the 0x1c2
+# polynomial used by the clmul routines, followed by the .Lrem_4bit and
+# .Lrem_8bit tables.  The back-quoted expressions in .Lrem_4bit are
+# evaluated by the substitution applied to $code just before it is printed.
+# Every '@' is escaped because this heredoc interpolates.
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+       .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+       .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type  .Lrem_4bit,\@object
+.Lrem_4bit:
+       .long   0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+       .long   0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+       .long   0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+       .long   0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type  .Lrem_8bit,\@object
+.Lrem_8bit:
+       .value  0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+       .value  0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+       .value  0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+       .value  0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+       .value  0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+       .value  0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+       .value  0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+       .value  0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+       .value  0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+       .value  0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+       .value  0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+       .value  0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+       .value  0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+       .value  0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+       .value  0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+       .value  0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+       .value  0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+       .value  0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+       .value  0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+       .value  0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+       .value  0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+       .value  0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+       .value  0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+       .value  0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+       .value  0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+       .value  0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+       .value  0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+       .value  0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+       .value  0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+       .value  0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+       .value  0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+       .value  0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz "GHASH for x86_64, CRYPTOGAMS by <address\@hidden>"
+.align 64
+___
+
+# Win64 structured-exception support, emitted only when $win64 is set.
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+#
+# se_handler is the language-specific handler referenced by the .xdata
+# records of gcm_gmult_4bit and gcm_ghash_4bit.  HandlerData[0] and
+# HandlerData[1] are the RVAs of the function's prologue and epilogue
+# labels.  If context->Rip lies between them, the handler reloads the
+# saved rbx/rbp/r12 from the interrupted frame into the CONTEXT; in every
+# case it then restores rsi/rdi/rsp, copies the CONTEXT to
+# disp->ContextRecord, calls RtlVirtualUnwind and returns
+# ExceptionContinueSearch.
+#
+# NOTE(review): the handler assumes a 24-byte save area holding three
+# registers, while the gcm_ghash_4bit epilogue earlier in this file
+# restores six registers from 280(%rsp).  Confirm that the handler matches
+# the prologues of both functions that reference it.
+#
+# gcm_ghash_clmul needs no handler: its .xdata record lists plain unwind
+# codes that mirror the hand-encoded prologue of that function.
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lin_prologue
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lin_prologue
+
+       lea     24(%rax),%rax           # adjust "rsp"
+
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+
+.Lin_prologue:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  se_handler,.-se_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_gcm_gmult_4bit
+       .rva    .LSEH_end_gcm_gmult_4bit
+       .rva    .LSEH_info_gcm_gmult_4bit
+
+       .rva    .LSEH_begin_gcm_ghash_4bit
+       .rva    .LSEH_end_gcm_ghash_4bit
+       .rva    .LSEH_info_gcm_ghash_4bit
+
+       .rva    .LSEH_begin_gcm_ghash_clmul
+       .rva    .LSEH_end_gcm_ghash_clmul
+       .rva    .LSEH_info_gcm_ghash_clmul
+
+.section       .xdata
+.align 8
+.LSEH_info_gcm_gmult_4bit:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lgmult_prologue,.Lgmult_epilogue       # HandlerData
+.LSEH_info_gcm_ghash_4bit:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lghash_prologue,.Lghash_epilogue       # HandlerData
+.LSEH_info_gcm_ghash_clmul:
+       .byte   0x01,0x1f,0x0b,0x00
+       .byte   0x1f,0xa8,0x04,0x00     #movaps 0x40(rsp),xmm10
+       .byte   0x19,0x98,0x03,0x00     #movaps 0x30(rsp),xmm9
+       .byte   0x13,0x88,0x02,0x00     #movaps 0x20(rsp),xmm8
+       .byte   0x0d,0x78,0x01,0x00     #movaps 0x10(rsp),xmm7
+       .byte   0x08,0x68,0x00,0x00     #movaps (rsp),xmm6
+       .byte   0x04,0xa2,0x00,0x00     #sub    rsp,0x58
+___
+}
+
+# Replace every back-quoted span in the generated text with the result of
+# evaluating it as a Perl expression (e.g. `0x1C20<<16`, `1232/8`).
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+# Buffered write errors only surface when the handle is closed, so a
+# failing close must abort the build instead of leaving a truncated file.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/devel/perlasm/license-gnutls.txt b/devel/perlasm/license-gnutls.txt
new file mode 100644
index 0000000..4201a66
--- /dev/null
+++ b/devel/perlasm/license-gnutls.txt
@@ -0,0 +1,20 @@
+#
+# Copyright (C) 2011 Free Software Foundation, Inc.
+#
+# Author: Nikos Mavrogiannopoulos
+#
+# This file is part of GnuTLS.
+#
+# The GnuTLS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 3 of
+# the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
diff --git a/devel/perlasm/license.txt b/devel/perlasm/license.txt
new file mode 100644
index 0000000..b1b2b21
--- /dev/null
+++ b/devel/perlasm/license.txt
@@ -0,0 +1,37 @@
+# Copyright (c) 2011, Andy Polyakov by <address@hidden>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+#     * Redistributions of source code must retain copyright notices,
+#      this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above
+#      copyright notice, this list of conditions and the following
+#      disclaimer in the documentation and/or other materials
+#      provided with the distribution.
+#
+#     * Neither the name of the Andy Polyakov nor the names of its
+#      copyright holder and contributors may be used to endorse or
+#      promote products derived from this software without specific
+#      prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/devel/perlasm/ppc-xlate.pl b/devel/perlasm/ppc-xlate.pl
new file mode 100755
index 0000000..a3edd98
--- /dev/null
+++ b/devel/perlasm/ppc-xlate.pl
@@ -0,0 +1,159 @@
+#!/usr/bin/env perl
+
+# PowerPC assembler distiller by <appro>.
+
+# Command line: <flavour> [<output file>]; the assembler source itself is
+# read from the remaining arguments or standard input by the main loop.
+my $flavour = shift;
+my $output = shift;
+# "or" instead of "||": "||" binds tighter than the list operator and
+# attached die() to the (always true) file name string, so a failed open
+# went unnoticed.  Without an output argument STDOUT is left as it is.
+if (defined($output)) {
+    open STDOUT,">$output" or die "can't open $output: $!";
+}
+
+# map of directive-declared global names to their flavour-specific spelling
+my %GLOBALS;
+# linux flavours spell local labels with a leading ".L", all others "L"
+my $dotinlocallabels=($flavour=~/linux/)?1:0;
+
+################################################################
+# directives which need special treatment on different platforms
+################################################################
+# Handler for the ".globl" directive.
+#
+# Arguments: the flag captured by the main loop (ignored) and the symbol
+# name.  The name is stripped of one leading "." or "_" and decorated the
+# way the flavour expects: ".name" on aix, "_name" on osx.  On linux a
+# ".type" directive is added, and on 64-bit linux a function descriptor is
+# emitted into the ".opd" section, with the code symbol becoming ".name".
+# The final spelling is recorded in %GLOBALS under the name as written in
+# the source, so that the main loop can rename the matching label.
+# Returns the text that replaces the directive.
+my $globl = sub {
+    my $junk = shift;
+    my $name = shift;
+    my $global = \$GLOBALS{$name};
+    my $ret;
+
+    $name =~ s|^[\.\_]||;
+ 
+    SWITCH: for ($flavour) {
+       /aix/           && do { $name = ".$name";
+                               last;
+                             };
+       /osx/           && do { $name = "_$name";
+                               last;
+                             };
+       /linux.*32/     && do { $ret .= ".globl $name\n";
+                               $ret .= ".type  $name,\@function";
+                               last;
+                             };
+       /linux.*64/     && do { $ret .= ".globl $name\n";
+                               $ret .= ".type  $name,\@function\n";
+                               $ret .= ".section       \".opd\",\"aw\"\n";
+                               $ret .= ".align 3\n";
+                               $ret .= "$name:\n";
+                               $ret .= ".quad  .$name,.TOC.\@tocbase,0\n";
+                               $ret .= ".size  $name,24\n";
+                               $ret .= ".previous\n";
+
+                               $name = ".$name";
+                               last;
+                             };
+    }
+
+    # aix/osx (and unknown flavours) only need the plain directive
+    $ret = ".globl     $name" if (!$ret);
+    $$global = $name;
+    $ret;
+};
+# Handler for the ".text" directive: aix gets ".csect", every other
+# flavour gets ".text".
+my $text = sub {
+    return ".csect" if ($flavour =~ /aix/);
+    return ".text";
+};
+# Handler for the ".machine" directive.  On osx the quotes around the
+# architecture are dropped and "any" is replaced by "ppc970-64" or
+# "ppc970" depending on whether the flavour mentions 64.
+my $machine = sub {
+    my ($f, $cpu) = @_;
+    if ($flavour =~ /osx/) {
+       $cpu =~ s/\"//g;
+       if ($cpu eq "any") {
+           $cpu = ($flavour =~ /64/) ? "ppc970-64" : "ppc970";
+       }
+    }
+    return ".machine  $cpu";
+};
+# Handler for the ".size" directive: kept (operands re-joined with commas)
+# for 32-bit linux flavours only, replaced by an empty line elsewhere.
+my $size = sub {
+    my ($f, @operands) = @_;
+    return "" unless ($flavour =~ /linux.*32/);
+    return ".size  " . join(",", @operands);
+};
+# Handler for the ".asciz" directive: a double-quoted operand is expanded
+# into a ".byte" list of its character codes plus a terminating 0,
+# followed by ".align 2".  Anything else yields an empty line.
+my $asciz = sub {
+    my ($f, @pieces) = @_;
+    # the main loop split the operand at commas; glue it back together
+    my $operand = join(",", @pieces);
+    return "" unless ($operand =~ /^"(.*)"$/);
+    my @bytes = (unpack("C*", $1), 0);
+    return ".byte  " . join(",", @bytes) . "\n.align     2";
+};
+
+################################################################
+# simplified mnemonics not handled by at least one assembler
+################################################################
+# Handler for "cmplw [crN,]rA,rB".  The condition register field defaults
+# to 0 when only two operands are given.  For 32-bit linux flavours the
+# instruction is emitted as a raw ".long", otherwise as a three-operand
+# cmplw.
+my $cmplw = sub {
+    my ($f, @ops) = @_;
+    my $cr = (@ops > 2) ? shift(@ops) : 0;
+    if ($flavour =~ /linux.*32/) {
+       # Some out-of-date 32-bit GNU assembler just can't handle cmplw...
+       my $insn = 31<<26 | $cr<<23 | $ops[0]<<16 | $ops[1]<<11 | 64;
+       return "       .long   " . sprintf("0x%x", $insn);
+    }
+    return "       cmplw   " . join(',', $cr, @ops);
+};
+# Handler for "bdnz[+-] target", installed for non-linux flavours only;
+# on linux $bdnz stays undefined and the main loop passes the mnemonic
+# through as written.
+# The declaration is kept separate from the conditional assignment because
+# perlsyn documents "my ... if ..." as having undefined behaviour.
+my $bdnz;
+$bdnz = sub {
+    my $f = shift;
+    my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint
+    "  bc      $bo,0,".shift;
+} if ($flavour!~/linux/);
+# Handler for "bltlr[-]": emitted as a raw ".long" on linux flavours and
+# as "bclr BO,0" elsewhere.
+my $bltlr = sub {
+    my $hint = shift;
+    my $bo = 12;
+    $bo += 2 if ($hint =~ /\-/);       # optional "not to be taken" hint
+    # GNU as doesn't allow most recent hints
+    if ($flavour =~ /linux/) {
+       return "       .long   " . sprintf("0x%x", 19<<26|$bo<<21|16<<1);
+    }
+    return "       bclr    $bo,0";
+};
+# Handler for "bnelr[-]": emitted as a raw ".long" on linux flavours and
+# as "bclr BO,2" elsewhere.
+my $bnelr = sub {
+    my $hint = shift;
+    my $bo = 4;
+    $bo += 2 if ($hint =~ /\-/);       # optional "not to be taken" hint
+    # GNU as doesn't allow most recent hints
+    if ($flavour =~ /linux/) {
+       return "       .long   " . sprintf("0x%x", 19<<26|$bo<<21|2<<16|16<<1);
+    }
+    return "       bclr    $bo,2";
+};
+# Handler for "beqlr[-]": emitted as a raw ".long" (upper-case hex digits)
+# on linux flavours and as "bclr BO,2" elsewhere.
+my $beqlr = sub {
+    my $hint = shift;
+    my $bo = 12;
+    $bo += 2 if ($hint =~ /-/);        # optional "not to be taken" hint
+    # GNU as doesn't allow most recent hints
+    if ($flavour =~ /linux/) {
+       return "       .long   " . sprintf("0x%X", 19<<26|$bo<<21|2<<16|16<<1);
+    }
+    return "       bclr    $bo,2";
+};
+# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two
+# arguments is 64, with "operand out of range" error.
+# Handler for "extrdi rA,rS,n,b": rewritten as the rldicl instruction with
+# rotate count (b+n) modulo 64 and mask start 64-n.
+my $extrdi = sub {
+    my ($f, $dst, $src, $width, $start) = @_;
+    my $rotate = ($start + $width) & 63;
+    my $mask_begin = 64 - $width;
+    return "  rldicl  $dst,$src,$rotate,$mask_begin";
+};
+
+# Main translation loop: reads the source line by line, normalises
+# comments, whitespace and local labels, renames labels of symbols
+# declared through the .globl handler, and dispatches each
+# mnemonic/directive to the handler closure of the same name when one
+# exists.  One output line is printed per input line (possibly empty).
+# The bare { } blocks limit the scope of the regex capture variables.
+while($line=<>) {
+
+    $line =~ s|[#!;].*$||;     # get rid of asm-style comments...
+    $line =~ s|/\*.*\*/||;     # ... and C-style comments...
+    $line =~ s|^\s+||;         # ... and skip white spaces in beginning...
+    $line =~ s|\s+$||;         # ... and at the end
+
+    {
+       $line =~ s|\b\.L(\w+)|L$1|g;    # common denominator for Locallabel
+       $line =~ s|\bL(\w+)|\.L$1|g     if ($dotinlocallabels);
+    }
+
+    {
+       # strip a leading "label:" and print it, using the flavour-specific
+       # spelling recorded in %GLOBALS when there is one
+       $line =~ s|(^[\.\w]+)\:\s*||;
+       my $label = $1;
+       printf "%s:",($GLOBALS{$label} or $label) if ($label);
+    }
+
+    {
+       # split off an optional leading dot, the mnemonic and an optional
+       # one-character suffix (".", "+" or "-")
+       $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||;
+       my $c = $1; $c = "\t" if ($c eq "");
+       my $mnemonic = $2;
+       my $f = $3;
+       # look up a lexical handler named after the mnemonic, if any
+       my $opcode = eval("\$$mnemonic");
+       # for instructions (not directives), reduce rN/fN/crN register
+       # names to bare numbers, except on osx
+       $line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/);
+       if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
+       elsif ($mnemonic)           { $line = $c.$mnemonic.$f."\t".$line; }
+    }
+
+    print $line if ($line);
+    print "\n";
+}
+
+# Buffered write errors only surface when the handle is closed; report them.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/devel/perlasm/readme b/devel/perlasm/readme
new file mode 100644
index 0000000..f02bbee
--- /dev/null
+++ b/devel/perlasm/readme
@@ -0,0 +1,124 @@
+The perl scripts in this directory are my 'hack' to generate
+multiple different assembler formats via the one original script.
+
+The way to use this library is to start with adding the path to this directory
+and then include it.
+
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+The first thing we do is set up the file and type of assembler
+
+&asm_init($ARGV[0],$0);
+
+The first argument is the 'type'.  Currently
+'cpp', 'sol', 'a.out', 'elf' or 'win32'.
+Argument 2 is the file name.
+
+The reciprocal function is
+&asm_finish() which should be called at the end.
+
+There are 2 main 'packages'. x86ms.pl, which is the microsoft assembler,
+and x86unix.pl which is the unix (gas) version.
+
+Functions of interest are:
+&external_label("des_SPtrans");        declare an external variable
+&LB(reg);                      Low byte for a register
+&HB(reg);                      High byte for a register
+&BP(off,base,index,scale)      Byte pointer addressing
+&DWP(off,base,index,scale)     Word pointer addressing
+&stack_push(num)               Basically a 'sub esp, num*4' with extra
+&stack_pop(num)                        inverse of stack_push
+&function_begin(name,extra)    Start a function with pushing of
+                               edi, esi, ebx and ebp.  extra is extra win32
+                               external info that may be required.
+&function_begin_B(name,extra)  Same as normal function_begin but no pushing.
+&function_end(name)            Call at end of function.
+&function_end_A(name)          Standard pop and ret, for use inside functions
+&function_end_B(name)          Call at end but without popping or 'ret'.
+&swtmp(num)                    Address on stack temp word.
+&wparam(num)                   Parameter number num, that was push
+                               in C convention.  This all works over pushes
+                               and pops.
+&comment("hello there")                Put in a comment.
+&label("loop")                 Refer to a label, normally a jmp target.
+&set_label("loop")             Set a label at this point.
+&data_word(word)               Put in a word of data.
+
+So how does this all hold together?  Given
+
+int calc(int len, int *data)
+       {
+       int i,j=0;
+
+       for (i=0; i<len; i++)
+               {
+               j+=other(data[i]);
+               }
+       }
+
+So a very simple version of this function could be coded as
+
+       push(@INC,"perlasm","../../perlasm");
+       require "x86asm.pl";
+       
+       &asm_init($ARGV[0],"cacl.pl");
+
+       &external_label("other");
+
+       $tmp1=  "eax";
+       $j=     "edi";
+       $data=  "esi";
+       $i=     "ebp";
+
+       &comment("a simple function");
+       &function_begin("calc");
+       &mov(   $data,          &wparam(1)); # data
+       &xor(   $j,             $j);
+       &xor(   $i,             $i);
+
+       &set_label("loop");
+       &cmp(   $i,             &wparam(0));
+       &jge(   &label("end"));
+
+       &mov(   $tmp1,          &DWP(0,$data,$i,4));
+       &push(  $tmp1);
+       &call(  "other");
+       &add(   $j,             "eax");
+       &pop(   $tmp1);
+       &inc(   $i);
+       &jmp(   &label("loop"));
+
+       &set_label("end");
+       &mov(   "eax",          $j);
+
+       &function_end("calc");
+
+       &asm_finish();
+
+The above example is very very unoptimised but gives an idea of how
+things work.
+
+There is also a cbc mode function generator in cbc.pl
+
+&cbc(  $name,
+       $encrypt_function_name,
+       $decrypt_function_name,
+       $true_if_byte_swap_needed,
+       $parameter_number_for_iv,
+       $parameter_number_for_encrypt_flag,
+       $first_parameter_to_pass,
+       $second_parameter_to_pass,
+       $third_parameter_to_pass);
+
+So for example, given
+void BF_encrypt(BF_LONG *data,BF_KEY *key);
+void BF_decrypt(BF_LONG *data,BF_KEY *key);
+void BF_cbc_encrypt(unsigned char *in, unsigned char *out, long length,
+        BF_KEY *ks, unsigned char *iv, int enc);
+
+&cbc("BF_cbc_encrypt","BF_encrypt","BF_encrypt",1,4,5,3,-1,-1);
+
+&cbc("des_ncbc_encrypt","des_encrypt","des_encrypt",0,4,5,3,5,-1);
+&cbc("des_ede3_cbc_encrypt","des_encrypt3","des_decrypt3",0,6,7,3,4,5);
+
diff --git a/devel/perlasm/x86_64-xlate.pl b/devel/perlasm/x86_64-xlate.pl
new file mode 100755
index 0000000..1f4ce0a
--- /dev/null
+++ b/devel/perlasm/x86_64-xlate.pl
@@ -0,0 +1,1083 @@
+#!/usr/bin/env perl
+
+# Ascetic x86_64 AT&T to MASM/NASM assembler translator by <appro>.
+#
+# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T
+# format is way easier to parse. Because it's simpler to "gear" from
+# Unix ABI to Windows one [see cross-reference "card" at the end of
+# file]. Because Linux targets were available first...
+#
+# In addition the script also "distills" code suitable for GNU
+# assembler, so that it can be compiled with more rigid assemblers,
+# such as Solaris /usr/ccs/bin/as.
+#
+# This translator is not designed to convert *arbitrary* assembler
+# code from AT&T format to MASM one. It's designed to convert just
+# enough to provide for dual-ABI OpenSSL modules development...
+# There *are* limitations and you might have to modify your assembler
+# code or this script to achieve the desired result...
+#
+# Currently recognized limitations:
+#
+# - can't use multiple ops per line;
+#
+# Dual-ABI styling rules.
+#
+# 1. Adhere to Unix register and stack layout [see cross-reference
+#    ABI "card" at the end for explanation].
+# 2. Forget about "red zone," stick to more traditional blended
+#    stack frame allocation. If volatile storage is actually required
+#    that is. If not, just leave the stack as is.
+# 3. Functions tagged with ".type name,@function" get crafted with
+#    unified Win64 prologue and epilogue automatically. If you want
+#    to take care of ABI differences yourself, tag functions as
+#    ".type name,@abi-omnipotent" instead.
+# 4. To optimize the Win64 prologue you can specify number of input
+#    arguments as ".type name,@function,N." Keep in mind that if N is
+#    larger than 6, then you *have to* write "abi-omnipotent" code,
+#    because >6 cases can't be addressed with unified prologue.
+# 5. Name local labels as .L*, do *not* use dynamic labels such as 1:
+#    (sorry about latter).
+# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is
+#    required to identify the spots, where to inject Win64 epilogue!
+#    But on the pros, it's then prefixed with rep automatically:-)
+# 7. Stick to explicit ip-relative addressing. If you have to use
+#    GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??.
+#    Both are recognized and translated to proper Win64 addressing
+#    modes. To support legacy code a synthetic directive, .picmeup,
+#    is implemented. It puts address of the *next* instruction into
+#    target register, e.g.:
+#
+#              .picmeup        %rax
+#              lea             .Label-.(%rax),%rax
+#
+# 8. In order to provide for structured exception handling unified
+#    Win64 prologue copies %rsp value to %rax. For further details
+#    see SEH paragraph at the end.
+# 9. .init segment is allowed to contain calls to functions only.
+# a. If function accepts more than 4 arguments *and* >4th argument
+#    is declared as non 64-bit value, do clear its upper part.
+
+# Command line: [flavour] output-file.  A lone argument that contains a
+# dot is taken to be the output file name and the flavour is left undefined.
+my $flavour = shift;
+my $output  = shift;
+if (defined($flavour) && $flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Redirect STDOUT to the output file, unless STDOUT already refers to that
+# very file (same device and inode).  With no output file the translation
+# simply goes to the existing STDOUT.
+if (defined($output)) {
+    my ($stddev,$stdino)=stat(STDOUT);
+    my ($outdev,$outino)=stat($output);
+
+    if (!defined($stddev) || !defined($outdev) ||
+       $stddev!=$outdev || $stdino!=$outino) {
+       # "or", not "||": "||" would bind to the file name string and the
+       # die would never be reached when open fails.
+       open(STDOUT,">$output") or die "can't open $output: $!";
+    }
+}
+
+# Output dialect defaults: GNU assembler syntax for an ELF target, Unix ABI.
+# An output file named *.asm selects a non-gas (MASM/NASM) dialect instead.
+my $gas=1;     $gas=0 if ($output =~ /\.asm$/);
+my $elf=1;     $elf=0 if (!$gas);
+my $win64=0;
+my $prefix="";         # prepended to global symbol names
+my $decor=".L";        # replaces the ".L" prefix of local labels
+
+my $masmref=8 + 50727*2**-32;  # 8.00.50727 shipped with VS2005
+my $masm=0;
+my $PTR=" PTR";
+
+my $nasmref=2.03;
+my $nasm=0;
+
+# An explicit flavour overrides the defaults above.  Without a recognized
+# flavour and with a non-gas output, the assembler is probed for.
+if    ($flavour eq "mingw64")  { $gas=1; $elf=0; $win64=1;
+                                 $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`;
+                                 chomp($prefix);
+                               }
+elsif ($flavour eq "macosx")   { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; }
+elsif ($flavour eq "masm")     { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; }
+elsif ($flavour eq "nasm")     { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; }
+elsif (!$gas)
+{   if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i)
+    {  $nasm = $1 + $2*0.01; $PTR="";  }
+    elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/)
+    {  $masm = $1 + $2*2**-16 + $4*2**-32;   }
+    die "no assembler found on %PATH" if (!($nasm || $masm));
+    $win64=1;
+    $elf=0;
+    $decor="\$L\$";
+}
+
+# Translator state shared by the parser packages below.
+my $current_segment;   # name of the section currently being emitted
+my $current_function;  # hash: name, abi, narg, scope of the open function
+my %globals;           # global symbol name => (prefixed) output name
+
+{ package opcode;      # pick up opcodes
+    # re(\$line): try to parse a mnemonic at the start of $line.  On
+    # success {op} holds the mnemonic, {sz} the operand-size suffix split
+    # off it ("" when the mnemonic takes none, undef when it is still to
+    # be derived from the operands via size()), the mnemonic and the
+    # whitespace after it are consumed from $line and the object is
+    # returned.  Returns undef when $line does not start with a mnemonic.
+    sub re {
+       my      $self = shift;  # single instance is enough...
+       local   *line = shift;
+       undef   $ret;
+
+       if ($line =~ /^([a-z][a-z0-9]*)/i) {
+           $self->{op} = $1;
+           $ret = $self;
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+
+           undef $self->{sz};
+           if ($self->{op} =~ /^(movz)x?([bw]).*/) {   # movz is pain...
+               $self->{op} = $1;
+               $self->{sz} = $2;
+           } elsif ($self->{op} =~ /call|jmp/) {
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /^v/) { # VEX
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
+               $self->{sz} = "";
+           } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
+               $self->{op} = $1;
+               $self->{sz} = $2;
+           }
+       }
+       $ret;
+    }
+    # size([$sz]): return the operand size; an argument sets it, but only
+    # if no size has been recorded yet.
+    sub size {
+       my $self = shift;
+       my $sz   = shift;
+       $self->{sz} = $sz if (defined($sz) && !defined($self->{sz}));
+       $self->{sz};
+    }
+    # out([$suffix]): render the mnemonic for the selected dialect.  "ret"
+    # is emitted as the bytes f3 c3 and, for svr4-tagged functions on
+    # Win64, is preceded by code reloading %rdi/%rsi from the stack.
+    sub out {
+       my $self = shift;
+       if ($gas) {
+           if ($self->{op} eq "movz") {        # movz is pain...
+               sprintf "%s%s%s",$self->{op},$self->{sz},shift;
+           } elsif ($self->{op} =~ /^set/) {
+               "$self->{op}";
+           } elsif ($self->{op} eq "ret") {
+               my $epilogue = "";
+               if ($win64 && $current_function->{abi} eq "svr4") {
+                   $epilogue = "movq   8(%rsp),%rdi\n\t" .
+                               "movq   16(%rsp),%rsi\n\t";
+               }
+               $epilogue . ".byte      0xf3,0xc3";
+           } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") {
+               # calls in .init become aligned pointer table entries
+               ".p2align\t3\n\t.quad";
+           } else {
+               "$self->{op}$self->{sz}";
+           }
+       } else {
+           $self->{op} =~ s/^movz/movzx/;
+           if ($self->{op} eq "ret") {
+               $self->{op} = "";
+               if ($win64 && $current_function->{abi} eq "svr4") {
+                   $self->{op} = "mov  rdi,QWORD${PTR}[8+rsp]\t;WIN64 epilogue\n\t".
+                                 "mov  rsi,QWORD${PTR}[16+rsp]\n\t";
+               }
+               $self->{op} .= "DB\t0F3h,0C3h\t\t;repret";
+           } elsif ($self->{op} =~ /^(pop|push)f/) {
+               $self->{op} .= $self->{sz};
+           } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
+               $self->{op} = "\tDQ";
+           }
+           $self->{op};
+       }
+    }
+    # mnemonic([$op]): get, or with an argument replace, the mnemonic.
+    sub mnemonic {
+       my $self=shift;
+       my $op=shift;
+       $self->{op}=$op if (defined($op));
+       $self->{op};
+    }
+}
+{ package const;       # pick up constants, which start with $
+    # re(\$line): if $line starts with "$", store everything up to the
+    # next comma in {value}, consume it (and following whitespace) from
+    # $line and return the object; otherwise return undef.
+    sub re {
+       my      $self = shift;  # single instance is enough...
+       local   *line = shift;
+       undef   $ret;
+
+       if ($line =~ /^\$([^,]+)/) {
+           $self->{value} = $1;
+           $ret = $self;
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+       }
+       $ret;
+    }
+    # out(): render the constant for the selected dialect.  Note that
+    # {value} is rewritten in place.
+    sub out {
+       my $self = shift;
+
+       if ($gas) {
+           # Solaris /usr/ccs/bin/as can't handle multiplications
+           # in $self->{value}
+           # First fold 0x../0.. literals to decimal with oct(), then
+           # evaluate simple "a*b", "a/b" and "a%b" sub-expressions.
+           $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+           $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+           sprintf "\$%s",$self->{value};
+       } else {
+           # 0b... literals become decimal; for MASM 0x... becomes 0...h
+           $self->{value} =~ s/(0b[0-1]+)/oct($1)/eig;
+           $self->{value} =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm);
+           sprintf "%s",$self->{value};
+       }
+    }
+}
+{ package ea;          # pick up effective addresses: expr(%reg,%reg,scale)
+    # re(\$line): try to parse "[*]label(base[,index[,scale]])" at the
+    # start of $line.  On success the parts are stored in {asterisk},
+    # {label}, {base}, {index} and {scale} (scale defaults to 1), the
+    # operand is consumed from $line and the object is returned;
+    # otherwise undef is returned.
+    sub re {
+       my      $self = shift;  # single instance is enough...
+       local   *line = shift;
+       undef   $ret;
+
+       # optional * ---vvv--- appears in indirect jmp/call
+       if ($line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)/) {
+           $self->{asterisk} = $1;
+           $self->{label} = $2;
+           ($self->{base},$self->{index},$self->{scale})=split(/,/,$3);
+           $self->{scale} = 1 if (!defined($self->{scale}));
+           $ret = $self;
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+
+           # On Win64 a GOTPCREL load is turned into a plain lea of the
+           # symbol; only "mov" is accepted with this addressing form.
+           if ($win64 && $self->{label} =~ s/\@GOTPCREL//) {
+               die if (opcode->mnemonic() ne "mov");
+               opcode->mnemonic("lea");
+           }
+           $self->{base}  =~ s/^%//;
+           $self->{index} =~ s/^%// if (defined($self->{index}));
+       }
+       $ret;
+    }
+    sub size {}
+    # out($sz): render the effective address for the selected dialect.
+    # $sz is the operand size letter; it is only used in the non-gas
+    # branch, where it selects the "BYTE PTR"-style size keyword.
+    sub out {
+       my $self = shift;
+       my $sz = shift;
+
+       $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+       $self->{label} =~ s/\.L/$decor/g;
+
+       # Silently convert all EAs to 64-bit. This is required for
+       # elder GNU assembler and results in more compact code,
+       # *but* most importantly AES module depends on this feature!
+       $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r$1/;
+       $self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r$1/;
+
+       # Solaris /usr/ccs/bin/as can't handle multiplications
+       # in $self->{label}, new gas requires sign extension...
+       use integer;
+       $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+       $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+       $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+
+       if ($gas) {
+           $self->{label} =~ s/^___imp_/__imp__/   if ($flavour eq "mingw64");
+
+           if (defined($self->{index})) {
+               sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk},
+                                       $self->{label},
+                                       $self->{base}?"%$self->{base}":"",
+                                       $self->{index},$self->{scale};
+           } else {
+               sprintf "%s%s(%%%s)",   $self->{asterisk},$self->{label},$self->{base};
+           }
+       } else {
+           %szmap = (  b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
+                       q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
+
+           $self->{label} =~ s/\./\$/g;
+           $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
+           $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
+           # indirect jmp/call and movq are always 64-bit, movd 32-bit
+           $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
+           $sz="l" if (opcode->mnemonic() eq "movd");
+
+           if (defined($self->{index})) {
+               sprintf "%s[%s%s*%d%s]",$szmap{$sz},
+                                       $self->{label}?"$self->{label}+":"",
+                                       $self->{index},$self->{scale},
+                                       $self->{base}?"+$self->{base}":"";
+           } elsif ($self->{base} eq "rip") {
+               sprintf "%s[%s]",$szmap{$sz},$self->{label};
+           } else {
+               sprintf "%s[%s%s]",$szmap{$sz},
+                                       $self->{label}?"$self->{label}+":"",
+                                       $self->{base};
+           }
+       }
+    }
+}
+{ package register;    # pick up registers, which start with %.
+    # re(\$line): if $line starts with "[*]%name", consume it (and the
+    # whitespace after it) and return a new blessed object holding
+    # {asterisk} and {value} (the name without "%"); otherwise undef.
+    sub re {
+       my      $class = shift; # multiple instances...
+       my      $self = {};
+       local   *line = shift;
+       undef   $ret;
+
+       # optional * ---vvv--- appears in indirect jmp/call
+       if ($line =~ /^(\*?)%(\w+)/) {
+           bless $self,$class;
+           $self->{asterisk} = $1;
+           $self->{value} = $2;
+           $ret = $self;
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+       }
+       $ret;
+    }
+    # size(): derive the operand size letter (b, w, l or q) from the
+    # register name, or undef if the name matches none of the patterns.
+    # The patterns are tried in order, so the sequence matters.
+    sub size {
+       my      $self = shift;
+       undef   $ret;
+
+       if    ($self->{value} =~ /^r[\d]+b$/i)  { $ret="b"; }
+       elsif ($self->{value} =~ /^r[\d]+w$/i)  { $ret="w"; }
+       elsif ($self->{value} =~ /^r[\d]+d$/i)  { $ret="l"; }
+       elsif ($self->{value} =~ /^r[\w]+$/i)   { $ret="q"; }
+       elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; }
+       elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; }
+       elsif ($self->{value} =~ /^[\w]{2}$/i)  { $ret="w"; }
+       elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; }
+
+       $ret;
+    }
+    # out(): "[*]%name" for gas, the bare name otherwise.
+    sub out {
+       my $self = shift;
+       if ($gas)       { sprintf "%s%%%s",$self->{asterisk},$self->{value}; }
+       else            { $self->{value}; }
+    }
+}
+{ package label;       # pick up labels, which end with :
+    # re(\$line): if $line starts with "name:", store the name in
+    # {value} (a leading ".L" replaced by $decor), consume the label and
+    # following whitespace from $line and return the object; otherwise
+    # return undef.
+    sub re {
+       my      $self = shift;  # single instance is enough...
+       local   *line = shift;
+       undef   $ret;
+
+       if ($line =~ /(^[\.\w]+)\:/) {
+           $self->{value} = $1;
+           $ret = $self;
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+
+           $self->{value} =~ s/^\.L/$decor/;
+       }
+       $ret;
+    }
+    # out(): render the label.  When the label is the entry point of the
+    # current function and that function is tagged svr4 on Win64, the
+    # unified prologue is appended: %rdi/%rsi are stored at 8 and 16 off
+    # %rsp, %rsp is copied to %rax, a SEH_begin_<name> label is emitted,
+    # and up to {narg} (default 6) arguments are moved from
+    # rcx, rdx, r8, r9, 40(%rsp), 48(%rsp) into rdi, rsi, rdx, rcx, r8, r9.
+    sub out {
+       my $self = shift;
+
+       if ($gas) {
+           my $func = ($globals{$self->{value}} or $self->{value}) . ":";
+           if ($win64  &&
+                       $current_function->{name} eq $self->{value} &&
+                       $current_function->{abi} eq "svr4") {
+               $func .= "\n";
+               $func .= "      movq    %rdi,8(%rsp)\n";
+               $func .= "      movq    %rsi,16(%rsp)\n";
+               $func .= "      movq    %rsp,%rax\n";
+               $func .= "${decor}SEH_begin_$current_function->{name}:\n";
+               my $narg = $current_function->{narg};
+               $narg=6 if (!defined($narg));
+               $func .= "      movq    %rcx,%rdi\n" if ($narg>0);
+               $func .= "      movq    %rdx,%rsi\n" if ($narg>1);
+               $func .= "      movq    %r8,%rdx\n"  if ($narg>2);
+               $func .= "      movq    %r9,%rcx\n"  if ($narg>3);
+               $func .= "      movq    40(%rsp),%r8\n" if ($narg>4);
+               $func .= "      movq    48(%rsp),%r9\n" if ($narg>5);
+           }
+           $func;
+       } elsif ($self->{value} ne "$current_function->{name}") {
+           # Ordinary (non-entry-point) label in MASM/NASM output.
+           # NOTE(review): $ret here is the package variable last set by
+           # re(), not the label text -- confirm the intended test.
+           $self->{value} .= ":" if ($masm && $ret!~m/^\$/);
+           $self->{value} . ":";
+       } elsif ($win64 && $current_function->{abi} eq "svr4") {
+           # Function entry: NASM gets "name:", MASM "name PROC scope".
+           my $func =  "$current_function->{name}" .
+                       ($nasm ? ":" : "\tPROC $current_function->{scope}") .
+                       "\n";
+           $func .= "  mov     QWORD${PTR}[8+rsp],rdi\t;WIN64 prologue\n";
+           $func .= "  mov     QWORD${PTR}[16+rsp],rsi\n";
+           $func .= "  mov     rax,rsp\n";
+           $func .= "${decor}SEH_begin_$current_function->{name}:";
+           $func .= ":" if ($masm);
+           $func .= "\n";
+           my $narg = $current_function->{narg};
+           $narg=6 if (!defined($narg));
+           $func .= "  mov     rdi,rcx\n" if ($narg>0);
+           $func .= "  mov     rsi,rdx\n" if ($narg>1);
+           $func .= "  mov     rdx,r8\n"  if ($narg>2);
+           $func .= "  mov     rcx,r9\n"  if ($narg>3);
+           $func .= "  mov     r8,QWORD${PTR}[40+rsp]\n" if ($narg>4);
+           $func .= "  mov     r9,QWORD${PTR}[48+rsp]\n" if ($narg>5);
+           $func .= "\n";
+       } else {
+          # Function entry without the unified prologue.
+          "$current_function->{name}".
+                       ($nasm ? ":" : "\tPROC $current_function->{scope}");
+       }
+    }
+}
+{ package expr;                # pick up expressions
+    # re(\$line): take everything up to the next comma as an expression,
+    # store it in {value}, consume it (and following whitespace) from
+    # $line and return the object; returns undef if nothing matched.
+    # Global symbol names are mapped through %globals and ".L" is
+    # replaced by $decor.
+    sub re {
+       my      $self = shift;  # single instance is enough...
+       local   *line = shift;
+       undef   $ret;
+
+       if ($line =~ /(^[^,]+)/) {
+           $self->{value} = $1;
+           $ret = $self;
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+
+           # the @PLT suffix is dropped for non-ELF targets
+           $self->{value} =~ s/\@PLT// if (!$elf);
+           $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+           $self->{value} =~ s/\.L/$decor/g;
+       }
+       $ret;
+    }
+    # out(): the expression text; for NASM, jump targets (mnemonics
+    # starting with "j") are prefixed with "NEAR ".
+    sub out {
+       my $self = shift;
+       if ($nasm && opcode->mnemonic()=~m/^j/) {
+           "NEAR ".$self->{value};
+       } else {
+           $self->{value};
+       }
+    }
+}
+{ package directive;   # pick up directives, which start with .
+    sub re {
+       my      $self = shift;  # single instance is enough...
+       local   *line = shift;
+       undef   $ret;
+       my      $dir;
+       my      %opcode =       # lea 2f-1f(%rip),%dst; 1: nop; 2:
+               (       "%rax"=>0x01058d48,     "%rcx"=>0x010d8d48,
+                       "%rdx"=>0x01158d48,     "%rbx"=>0x011d8d48,
+                       "%rsp"=>0x01258d48,     "%rbp"=>0x012d8d48,
+                       "%rsi"=>0x01358d48,     "%rdi"=>0x013d8d48,
+                       "%r8" =>0x01058d4c,     "%r9" =>0x010d8d4c,
+                       "%r10"=>0x01158d4c,     "%r11"=>0x011d8d4c,
+                       "%r12"=>0x01258d4c,     "%r13"=>0x012d8d4c,
+                       "%r14"=>0x01358d4c,     "%r15"=>0x013d8d4c      );
+
+       if ($line =~ /^\s*(\.\w+)/) {
+           $dir = $1;
+           $ret = $self;
+           undef $self->{value};
+           $line = substr($line,@+[0]); $line =~ s/^\s+//;
+
+           SWITCH: for ($dir) {
+               /\.picmeup/ && do { if ($line =~ /(%r[\w]+)/i) {
+                                       $dir="\t.long";
+                                       $line=sprintf 
"0x%x,0x90000000",$opcode{$1};
+                                   }
+                                   last;
+                                 };
+               /\.global|\.globl|\.extern/
+                           && do { $globals{$line} = $prefix . $line;
+                                   $line = $globals{$line} if ($prefix);
+                                   last;
+                                 };
+               /\.type/    && do { ($sym,$type,$narg) = split(',',$line);
+                                   if ($type eq "address@hidden") {
+                                       undef $current_function;
+                                       $current_function->{name} = $sym;
+                                       $current_function->{abi}  = "svr4";
+                                       $current_function->{narg} = $narg;
+                                       $current_function->{scope} = 
defined($globals{$sym})?"PUBLIC":"PRIVATE";
+                                   } elsif ($type eq "address@hidden") {
+                                       undef $current_function;
+                                       $current_function->{name} = $sym;
+                                       $current_function->{scope} = 
defined($globals{$sym})?"PUBLIC":"PRIVATE";
+                                   }
+                                   $line =~ s/address@hidden/address@hidden/;
+                                   $line =~ s/address@hidden/address@hidden/;
+                                   last;
+                                 };
+               /\.asciz/   && do { if ($line =~ /^"(.*)"$/) {
+                                       $dir  = ".byte";
+                                       $line = join(",",unpack("C*",$1),0);
+                                   }
+                                   last;
+                                 };
+               /\.rva|\.long|\.quad/
+                           && do { $line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} 
or $1/gei;
+                                   $line =~ s/\.L/$decor/g;
+                                   last;
+                                 };
+           }
+
+           if ($gas) {
+               $self->{value} = $dir . "\t" . $line;
+
+               if ($dir =~ /\.extern/) {
+                   $self->{value} = ""; # swallow extern
+               } elsif (!$elf && $dir =~ /\.type/) {
+                   $self->{value} = "";
+                   $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" .
+                               (defined($globals{$1})?".scl 2;":".scl 3;") .
+                               "\t.type 32;\t.endef"
+                               if ($win64 && $line =~ 
/([^,]+),address@hidden/);
+               } elsif (!$elf && $dir =~ /\.size/) {
+                   $self->{value} = "";
+                   if (defined($current_function)) {
+                       $self->{value} .= 
"${decor}SEH_end_$current_function->{name}:"
+                               if ($win64 && $current_function->{abi} eq 
"svr4");
+                       undef $current_function;
+                   }
+               } elsif (!$elf && $dir =~ /\.align/) {
+                   $self->{value} = ".p2align\t" . (log($line)/log(2));
+               } elsif ($dir eq ".section") {
+                   $current_segment=$line;
+                   if (!$elf && $current_segment eq ".init") {
+                       if      ($flavour eq "macosx")  { $self->{value} = 
".mod_init_func"; }
+                       elsif   ($flavour eq "mingw64") { $self->{value} = 
".section\t.ctors"; }
+                   }
+               } elsif ($dir =~ /\.(text|data)/) {
+                   $current_segment=".$1";
+               } elsif ($dir =~ /\.hidden/) {
+                   if    ($flavour eq "macosx")  { $self->{value} = 
".private_extern\t$prefix$line"; }
+                   elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+               } elsif ($dir =~ /\.comm/) {
+                   $self->{value} = "$dir\t$prefix$line";
+                   $self->{value} =~ 
s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
+               }
+               $line = "";
+               return $self;
+           }
+
+           # non-gas case or nasm/masm
+           SWITCH: for ($dir) {
+               /\.text/    && do { my $v=undef;
+                                   if ($nasm) {
+                                       $v="section     .text code align=64\n";
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if 
($current_segment);
+                                       $current_segment = ".text\$";
+                                       $v.="$current_segment\tSEGMENT ";
+                                       $v.=$masm>=$masmref ? "ALIGN(64)" : 
"PAGE";
+                                       $v.=" 'CODE'";
+                                   }
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+               /\.data/    && do { my $v=undef;
+                                   if ($nasm) {
+                                       $v="section     .data data align=8\n";
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if 
($current_segment);
+                                       $current_segment = "_DATA";
+                                       $v.="$current_segment\tSEGMENT";
+                                   }
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+               /\.section/ && do { my $v=undef;
+                                   $line =~ s/([^,]*).*/$1/;
+                                   $line = ".CRT\$XCU" if ($line eq ".init");
+                                   if ($nasm) {
+                                       $v="section     $line";
+                                       if ($line=~/\.([px])data/) {
+                                           $v.=" rdata align=";
+                                           $v.=$1 eq "p"? 4 : 8;
+                                       } elsif ($line=~/\.CRT\$/i) {
+                                           $v.=" rdata align=8";
+                                       }
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if 
($current_segment);
+                                       $v.="$line\tSEGMENT";
+                                       if ($line=~/\.([px])data/) {
+                                           $v.=" READONLY";
+                                           $v.=" ALIGN(".($1 eq "p" ? 4 : 
8).")" if ($masm>=$masmref);
+                                       } elsif ($line=~/\.CRT\$/i) {
+                                           $v.=" READONLY ALIGN(8)";
+                                       }
+                                   }
+                                   $current_segment = $line;
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+               /\.extern/  && do { $self->{value}  = "EXTERN\t".$line;
+                                   $self->{value} .= ":NEAR" if ($masm);
+                                   last;
+                                 };
+               /\.globl|.global/
+                           && do { $self->{value}  = $masm?"PUBLIC":"global";
+                                   $self->{value} .= "\t".$line;
+                                   last;
+                                 };
+               /\.size/    && do { if (defined($current_function)) {
+                                       undef $self->{value};
+                                       if ($current_function->{abi} eq "svr4") 
{
+                                           
$self->{value}="${decor}SEH_end_$current_function->{name}:";
+                                           $self->{value}.=":\n" if($masm);
+                                       }
+                                       
$self->{value}.="$current_function->{name}\tENDP" if($masm && 
$current_function->{name});
+                                       undef $current_function;
+                                   }
+                                   last;
+                                 };
+               /\.align/   && do { $self->{value} = "ALIGN\t".$line; last; };
+               /\.(value|long|rva|quad)/
+                           && do { my $sz  = substr($1,0,1);
+                                   my @arr = split(/,\s*/,$line);
+                                   my $last = pop(@arr);
+                                   my $conv = sub  {   my $var=shift;
+                                                       
$var=~s/^(0b[0-1]+)/oct($1)/eig;
+                                                       
$var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm);
+                                                       if ($sz eq "D" && 
($current_segment=~/.[px]data/ || $dir eq ".rva"))
+                                                       { 
$var=~s/(address@hidden@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
+                                                       $var;
+                                                   };  
+
+                                   $sz =~ tr/bvlrq/BWDDQ/;
+                                   $self->{value} = "\tD$sz\t";
+                                   for (@arr) { $self->{value} .= 
&$conv($_).","; }
+                                   $self->{value} .= &$conv($last);
+                                   last;
+                                 };
+               /\.byte/    && do { my @str=split(/,\s*/,$line);
+                                   map(s/(0b[0-1]+)/oct($1)/eig,@str);
+                                   map(s/0x([0-9a-f]+)/0$1h/ig,@str) if 
($masm);       
+                                   while ($#str>15) {
+                                       $self->{value}.="DB\t"
+                                               .join(",",@str[0..15])."\n";
+                                       foreach (0..15) { shift @str; }
+                                   }
+                                   $self->{value}.="DB\t"
+                                               .join(",",@str) if (@str);
+                                   last;
+                                 };
+               /\.comm/    && do { my @str=split(/,\s*/,$line);
+                                   my $v=undef;
+                                   if ($nasm) {
+                                       $v.="common     address@hidden @str[1]";
+                                   } else {
+                                       $v="$current_segment\tENDS\n" if 
($current_segment);
+                                       $current_segment = "_DATA";
+                                       $v.="$current_segment\tSEGMENT\n";
+                                       $v.="COMM       
@str[0]:DWORD:"address@hidden/4;
+                                   }
+                                   $self->{value} = $v;
+                                   last;
+                                 };
+           }
+           $line = "";
+       }
+
+       $ret;
+    }
+    # Return the translated directive text stored in $self->{value}.
+    sub out {
+       my $self = shift;
+       $self->{value};
+    }
+}
+
+# Append a REX prefix byte to an opcode array when one is required.
+#   1st arg  reference to the array of opcode bytes being assembled
+#   $dst     register number placed in the ModR/M reg field; a value
+#            >=8 sets REX.R (0x04)
+#   $src     register number placed in the ModR/M r/m field; a value
+#            >=8 sets REX.B (0x01)
+#   $rex     optional initial REX bits (callers pass 0x8 for REX.W)
+# Nothing is pushed when no REX bit ends up set.
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+   $rex|=0x04 if($dst>=8);
+   $rex|=0x01 if($src>=8);
+   push @opcode,($rex|0x40) if ($rex);
+}
+
+# older gas and ml64 don't handle SSE>2 instructions
+my %regrm = (  "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+               "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7      );
+
+# Hand-encode an inter-register movq between an XMM register and a 64-bit
+# general-purpose register. Takes the AT&T-syntax operand string and
+# returns the list of opcode bytes, or an empty list when the operands are
+# not such a register pair (the caller then translates the line normally).
+my $movq = sub {       # elderly gas can't handle inter-register movq
+  my $arg = shift;
+  my @opcode=(0x66);
+    if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+       # xmm -> GP register: opcode 0F 7E
+       my ($src,$dst)=($1,$2);
+       if ($dst !~ /[0-9]+/)   { $dst = $regrm{"%e$dst"}; }
+       rex(\@opcode,$src,$dst,0x8);
+       push @opcode,0x0f,0x7e;
+       push @opcode,0xc0|(($src&7)<<3)|($dst&7);       # ModR/M
+       @opcode;
+    } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+       # GP register -> xmm: opcode 0F 6E. Here $src holds the xmm number
+       # (ModR/M reg field) and $dst the GP register (ModR/M r/m field).
+       my ($src,$dst)=($2,$1);
+       if ($dst !~ /[0-9]+/)   { $dst = $regrm{"%e$dst"}; }
+       rex(\@opcode,$src,$dst,0x8);
+       push @opcode,0x0f,0x6e;
+       push @opcode,0xc0|(($src&7)<<3)|($dst&7);       # ModR/M
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Hand-encode "pextrd $imm,%xmmN,%reg" as 66 [REX] 0F 3A 16 /r ib for
+# assemblers that cannot assemble it themselves. Returns the opcode bytes,
+# or an empty list if the operand string does not match.
+my $pextrd = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+      my @opcode=(0x66);
+      # lexical copies, so the captures do not leak into package globals
+      my ($imm,$src,$dst)=($1,$2,$3);
+       if ($dst =~ /%r([0-9]+)d/)      { $dst = $1; }
+       elsif ($dst =~ /%e/)            { $dst = $regrm{$dst}; }
+       rex(\@opcode,$src,$dst);
+       push @opcode,0x0f,0x3a,0x16;
+       push @opcode,0xc0|(($src&7)<<3)|($dst&7);       # ModR/M
+       push @opcode,$imm;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Hand-encode "pinsrd $imm,%reg,%xmmN" as 66 [REX] 0F 3A 22 /r ib for
+# assemblers that cannot assemble it themselves. Returns the opcode bytes,
+# or an empty list if the operand string does not match.
+my $pinsrd = sub {
+    if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+      # lexical copies, so the captures do not leak into package globals
+      my ($imm,$src,$dst)=($1,$2,$3);
+       if ($src =~ /%r([0-9]+)/)       { $src = $1; }
+       elsif ($src =~ /%e/)            { $src = $regrm{$src}; }
+       rex(\@opcode,$dst,$src);
+       push @opcode,0x0f,0x3a,0x22;
+       push @opcode,0xc0|(($dst&7)<<3)|($src&7);       # ModR/M
+       push @opcode,$imm;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Hand-encode "pshufb %xmmS,%xmmD" as 66 [REX] 0F 38 00 /r. Returns the
+# opcode bytes, or an empty list if the operands are not two xmm registers.
+my $pshufb = sub {
+    if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       rex(\@opcode,$2,$1);
+       push @opcode,0x0f,0x38,0x00;
+       push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Hand-encode "palignr $imm,%xmmS,%xmmD" as 66 [REX] 0F 3A 0F /r ib.
+# Returns the opcode bytes, or an empty list if the operands do not match.
+my $palignr = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       rex(\@opcode,$3,$2);
+       push @opcode,0x0f,0x3a,0x0f;
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       push @opcode,$1;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Hand-encode "pclmulqdq $imm,%xmmS,%xmmD" as 66 [REX] 0F 3A 44 /r ib.
+# The immediate may be decimal or carry a leading 0 (0x.. hex or octal),
+# in which case it is converted with oct(). Returns the opcode bytes, or
+# an empty list if the operands do not match.
+my $pclmulqdq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+       rex(\@opcode,$3,$2);
+       push @opcode,0x0f,0x3a,0x44;
+       push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
+       my $c=$1;
+       push @opcode,$c=~/^0/?oct($c):$c;
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Hand-encode "rdrand %reg" as [REX] 0F C7 /6. Returns the opcode bytes,
+# or an empty list if the operand is not a %e.. / %r.. register.
+my $rdrand = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+       if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+       # Pass the resolved register number, not $1: for numbered registers
+       # the match above succeeds and resets $1, so REX.B would never be
+       # set for r8-r15.
+       rex(\@opcode,0,$dst,8);
+       push @opcode,0x0f,0xc7,0xf0|($dst&7);
+       @opcode;
+    } else {
+       ();
+    }
+};
+
+# Emit the assembler-specific preamble before any translated code.
+if ($nasm) {
+    print <<___;
+default        rel
+%define XMMWORD
+___
+} elsif ($masm) {
+    print <<___;
+OPTION DOTNAME
+___
+}
+# Translate the input one line at a time. Comments are stripped, then an
+# optional label, a directive or an opcode with its arguments are peeled
+# off the front of $line and printed in the target syntax; whatever is
+# left of $line is printed verbatim at the end.
+while($line=<>) {
+
+    chomp($line);
+
+    $line =~ s|[#!].*$||;      # get rid of asm-style comments...
+    $line =~ s|/\*.*\*/||;     # ... and C-style comments...
+    $line =~ s|^\s+||;         # ... and skip white spaces in beginning
+
+    undef $label;
+    undef $opcode;
+    undef @args;
+
+    if ($label=label->re(\$line))      { print $label->out(); }
+
+    if (directive->re(\$line)) {
+       printf "%s",directive->out();
+    } elsif ($opcode=opcode->re(\$line)) {
+       # A code ref named after the mnemonic ($movq, $pshufb, ...) means
+       # the instruction may have to be emitted as raw bytes.
+       my $asm = eval("\$".$opcode->mnemonic());
+       undef @bytes;
+
+       if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
+           print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+           next;
+       }
+
+       ARGUMENT: while (1) {
+       my $arg;
+
+       if ($arg=register->re(\$line))  { opcode->size($arg->size()); }
+       elsif ($arg=const->re(\$line))  { }
+       elsif ($arg=ea->re(\$line))     { }
+       elsif ($arg=expr->re(\$line))   { }
+       else                            { last ARGUMENT; }
+
+       push @args,$arg;
+
+       last ARGUMENT if ($line !~ /^,/);
+
+       $line =~ s/^,\s*//;
+       } # ARGUMENT:
+
+       if ($#args>=0) {
+           my $insn;
+           my $sz=opcode->size();
+
+           if ($gas) {
+               $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+               @args = map($_->out($sz),@args);
+               printf "\t%s\t%s",$insn,join(",",@args);
+           } else {
+               $insn = $opcode->out();
+               foreach (@args) {
+                   my $arg = $_->out();
+                   # $insn.=$sz compensates for movq, pinsrw, ...
+                   if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+                   if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
+               }
+               # the non-gas targets take operands in the opposite order
+               @args = reverse(@args);
+               undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+               printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
+           }
+       } else {
+           printf "\t%s",$opcode->out();
+       }
+    }
+
+    print $line,"\n";
+}
+
+# MASM needs the open segment closed and an explicit END.
+print "\n$current_segment\tENDS\n"     if ($current_segment && $masm);
+print "END\n"                          if ($masm);
+
+close STDOUT;
+
+#################################################
+# Cross-reference x86_64 ABI "card"
+#
+#              Unix            Win64
+# %rax         *               *
+# %rbx         -               -
+# %rcx         #4              #1
+# %rdx         #3              #2
+# %rsi         #2              -
+# %rdi         #1              -
+# %rbp         -               -
+# %rsp         -               -
+# %r8          #5              #3
+# %r9          #6              #4
+# %r10         *               *
+# %r11         *               *
+# %r12         -               -
+# %r13         -               -
+# %r14         -               -
+# %r15         -               -
+# 
+# (*)  volatile register
+# (-)  preserved by callee
+# (#)  Nth argument, volatile
+#
+# In Unix terms top of stack is argument transfer area for arguments
+# which could not be accommodated in registers. Or in other words 7th
+# [integer] argument resides at 8(%rsp) upon function entry point.
+# 128 bytes above %rsp constitute a "red zone" which is not touched
+# by signal handlers and can be used as temporal storage without
+# allocating a frame.
+#
+# In Win64 terms N*8 bytes on top of stack is argument transfer area,
+# which belongs to/can be overwritten by callee. N is the number of
+# arguments passed to callee, *but* not less than 4! This means that
+# upon function entry point 5th argument resides at 40(%rsp), as well
+# as that 32 bytes from 8(%rsp) can always be used as temporal
+# storage [without allocating a frame]. One can actually argue that
+# one can assume a "red zone" above stack pointer under Win64 as well.
+# Point is that at apparently no occasion Windows kernel would alter
+# the area above user stack pointer in true asynchronous manner...
+#
+# All the above means that if assembler programmer adheres to Unix
+# register and stack layout, but disregards the "red zone" existence,
+# it's possible to use following prologue and epilogue to "gear" from
+# Unix to Win64 ABI in leaf functions with not more than 6 arguments.
+#
+# omnipotent_function:
+# ifdef WIN64
+#      movq    %rdi,8(%rsp)
+#      movq    %rsi,16(%rsp)
+#      movq    %rcx,%rdi       ; if 1st argument is actually present
+#      movq    %rdx,%rsi       ; if 2nd argument is actually ...
+#      movq    %r8,%rdx        ; if 3rd argument is ...
+#      movq    %r9,%rcx        ; if 4th argument ...
+#      movq    40(%rsp),%r8    ; if 5th ...
+#      movq    48(%rsp),%r9    ; if 6th ...
+# endif
+#      ...
+# ifdef WIN64
+#      movq    8(%rsp),%rdi
+#      movq    16(%rsp),%rsi
+# endif
+#      ret
+#
+#################################################
+# Win64 SEH, Structured Exception Handling.
+#
+# Unlike on Unix systems(*) lack of Win64 stack unwinding information
+# has undesired side-effect at run-time: if an exception is raised in
+# assembler subroutine such as those in question (basically we're
+# referring to segmentation violations caused by malformed input
+# parameters), the application is briskly terminated without invoking
+# any exception handlers, most notably without generating memory dump
+# or any user notification whatsoever. This poses a problem. It's
+# possible to address it by registering custom language-specific
+# handler that would restore processor context to the state at
+# subroutine entry point and return "exception is not handled, keep
+# unwinding" code. Writing such handler can be a challenge... But it's
+# doable, though requires certain coding convention. Consider following
+# snippet:
+#
+# .type        function,@function
+# function:
+#      movq    %rsp,%rax       # copy rsp to volatile register
+#      pushq   %r15            # save non-volatile registers
+#      pushq   %rbx
+#      pushq   %rbp
+#      movq    %rsp,%r11
+#      subq    %rdi,%r11       # prepare [variable] stack frame
+#      andq    $-64,%r11
+#      movq    %rax,0(%r11)    # check for exceptions
+#      movq    %r11,%rsp       # allocate [variable] stack frame
+#      movq    %rax,0(%rsp)    # save original rsp value
+# magic_point:
+#      ...
+#      movq    0(%rsp),%rcx    # pull original rsp value
+#      movq    -24(%rcx),%rbp  # restore non-volatile registers
+#      movq    -16(%rcx),%rbx
+#      movq    -8(%rcx),%r15
+#      movq    %rcx,%rsp       # restore original rsp
+#      ret
+# .size function,.-function
+#
+# The key is that up to magic_point copy of original rsp value remains
+# in chosen volatile register and no non-volatile register, except for
+# rsp, is modified. While past magic_point rsp remains constant till
+# the very end of the function. In this case custom language-specific
+# exception handler would look like this:
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+# {    ULONG64 *rsp = (ULONG64 *)context->Rax;
+#      if (context->Rip >= magic_point)
+#      {   rsp = ((ULONG64 **)context->Rsp)[0];
+#          context->Rbp = rsp[-3];
+#          context->Rbx = rsp[-2];
+#          context->R15 = rsp[-1];
+#      }
+#      context->Rsp = (ULONG64)rsp;
+#      context->Rdi = rsp[1];
+#      context->Rsi = rsp[2];
+#
+#      memcpy (disp->ContextRecord,context,sizeof(CONTEXT));
+#      RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase,
+#              disp->ControlPc,disp->FunctionEntry,disp->ContextRecord,
+#              &disp->HandlerData,&disp->EstablisherFrame,NULL);
+#      return ExceptionContinueSearch;
+# }
+#
+# It's appropriate to implement this handler in assembler, directly in
+# function's module. In order to do that one has to know members'
+# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant
+# values. Here they are:
+#
+#      CONTEXT.Rax                             120
+#      CONTEXT.Rcx                             128
+#      CONTEXT.Rdx                             136
+#      CONTEXT.Rbx                             144
+#      CONTEXT.Rsp                             152
+#      CONTEXT.Rbp                             160
+#      CONTEXT.Rsi                             168
+#      CONTEXT.Rdi                             176
+#      CONTEXT.R8                              184
+#      CONTEXT.R9                              192
+#      CONTEXT.R10                             200
+#      CONTEXT.R11                             208
+#      CONTEXT.R12                             216
+#      CONTEXT.R13                             224
+#      CONTEXT.R14                             232
+#      CONTEXT.R15                             240
+#      CONTEXT.Rip                             248
+#      CONTEXT.Xmm6                            512
+#      sizeof(CONTEXT)                         1232
+#      DISPATCHER_CONTEXT.ControlPc            0
+#      DISPATCHER_CONTEXT.ImageBase            8
+#      DISPATCHER_CONTEXT.FunctionEntry        16
+#      DISPATCHER_CONTEXT.EstablisherFrame     24
+#      DISPATCHER_CONTEXT.TargetIp             32
+#      DISPATCHER_CONTEXT.ContextRecord        40
+#      DISPATCHER_CONTEXT.LanguageHandler      48
+#      DISPATCHER_CONTEXT.HandlerData          56
+#      UNW_FLAG_NHANDLER                       0
+#      ExceptionContinueSearch                 1
+#
+# In order to tie the handler to the function one has to compose
+# couple of structures: one for .xdata segment and one for .pdata.
+#
+# UNWIND_INFO structure for .xdata segment would be
+#
+# function_unwind_info:
+#      .byte   9,0,0,0
+#      .rva    handler
+#
+# This structure designates exception handler for a function with
+# zero-length prologue, no stack frame or frame register.
+#
+# To facilitate composing of .pdata structures, auto-generated "gear"
+# prologue copies rsp value to rax and denotes next instruction with
+# .LSEH_begin_{function_name} label. This essentially defines the SEH
+# styling rule mentioned in the beginning. Position of this label is
+# chosen in such manner that possible exceptions raised in the "gear"
+# prologue would be accounted to caller and unwound from latter's frame.
+# End of function is marked with respective .LSEH_end_{function_name}
+# label. To summarize, .pdata segment would contain
+#
+#      .rva    .LSEH_begin_function
+#      .rva    .LSEH_end_function
+#      .rva    function_unwind_info
+#
+# Reference to function_unwind_info from .xdata segment is the anchor.
+# In case you wonder why references are 32-bit .rvas and not 64-bit
+# .quads. References put into these two segments are required to be
+# *relative* to the base address of the current binary module, a.k.a.
+# image base. No Win64 module, be it .exe or .dll, can be larger than
+# 2GB and thus such relative references can be and are accommodated in
+# 32 bits.
+#
+# Having reviewed the example function code, one can argue that "movq
+# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix
+# rax would contain an undefined value. If this "offends" you, use
+# another register and refrain from modifying rax till magic_point is
+# reached, i.e. as if it was a non-volatile register. If more registers
+# are required prior [variable] frame setup is completed, note that
+# nobody says that you can have only one "magic point." You can
+# "liberate" non-volatile registers by denoting last stack off-load
+# instruction and reflecting it in finer grade unwind logic in handler.
+# After all, isn't it why it's called *language-specific* handler...
+#
+# Attentive reader can notice that exceptions would be mishandled in
+# auto-generated "gear" epilogue. Well, exception effectively can't
+# occur there, because if memory area used by it was subject to
+# segmentation violation, then it would be raised upon call to the
+# function (and as already mentioned be accounted to caller, which is
+# not a problem). If you're still not comfortable, then define tail
+# "magic point" just prior ret instruction and have handler treat it...
+#
+# (*)  Note that we're talking about run-time, not debug-time. Lack of
+#      unwind information makes debugging hard on both Windows and
+#      Unix. "Unlike" refers to the fact that on Unix signal handler
+#      will always be invoked, core dumped and appropriate exit code
+#      returned to parent (for user notification).
diff --git a/devel/perlasm/x86asm.pl b/devel/perlasm/x86asm.pl
new file mode 100644
index 0000000..eb543db
--- /dev/null
+++ b/devel/perlasm/x86asm.pl
@@ -0,0 +1,260 @@
+#!/usr/bin/env perl
+
+# require 'x86asm.pl';
+# &asm_init(<flavor>,"des-586.pl"[,$i386only]);
+# &function_begin("foo");
+# ...
+# &function_end("foo");
+# &asm_finish
+
+@out=();       # output buffer: assembler lines collected by emit() etc.
+$i386=0;       # set by asm_init(); non-zero selects i386-only code paths
+
+# AUTOLOAD in this context has quite unpleasant side effect, namely
+# that typos in function calls effectively go to assembler output,
+# but on the pros side we don't have to implement one subroutine per
+# each opcode...
+# Catch-all for opcode subs that are not defined explicitly: the called
+# name becomes the mnemonic and is forwarded to generic(). push*/pop*
+# mnemonics also move the tracked $stack offset by 4 bytes.
+sub ::AUTOLOAD
+{ my $mnemonic = $AUTOLOAD;
+
+    die "more than 4 arguments passed to $mnemonic" if (scalar(@_)>4);
+
+    # drop the package qualifier, keeping the bare sub name
+    $mnemonic =~ s/.*:://;
+    $stack+=4 if ($mnemonic =~ /^push/);
+    $stack-=4 if ($mnemonic =~ /^pop/);
+
+    &generic($mnemonic,@_) or die "undefined subroutine \&$AUTOLOAD";
+}
+
+# Append one instruction line to @out: a tab, the opcode and, when
+# operands are given, a second tab followed by the comma-joined operands.
+sub ::emit
+{ my $opcode=shift;
+
+    if ($#_==-1)    { push(@out,"\t$opcode\n");                        }
+    else            { push(@out,"\t$opcode\t".join(',',@_)."\n");      }
+}
+
+# Map ax/bx/cx/dx (optionally e-prefixed) to its low-byte register
+# name, e.g. "eax" -> "al". Dies for any other argument.
+sub ::LB
+{ my ($reg)=@_;
+    die "$reg does not have a 'low byte'" unless ($reg =~ m/^e?([a-d])x$/o);
+  $1."l";
+}
+# Map ax/bx/cx/dx (optionally e-prefixed) to its high-byte register
+# name, e.g. "eax" -> "ah". Dies for any other argument.
+sub ::HB
+{ my ($reg)=@_;
+    die "$reg does not have a 'high byte'" unless ($reg =~ m/^e?([a-d])x$/o);
+  $1."h";
+}
+# Stack bookkeeping helpers. $stack tracks the byte offset that wparam()
+# adds when addressing the caller's arguments relative to esp.
+# stack_push(N)/stack_pop(N): reserve/release N dwords and adjust $stack.
+sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num);     }
+sub ::stack_pop        { my $num=$_[0]*4; $stack-=$num; &add("esp",$num);      }
+# blindpop: pop, then add 4 back to $stack so the tracked offset is unchanged
+sub ::blindpop { &pop($_[0]); $stack+=4;                               }
+# wparam(N): operand at $stack+4*N bytes off esp
+sub ::wparam   { &DWP($stack+4*$_[0],"esp");                           }
+# swtmp(N): operand at 4*N bytes off esp
+sub ::swtmp    { &DWP(4*$_[0],"esp");                                  }
+
+# Byte-swap a 32-bit register. When $i386 is set the instruction is
+# emulated with xchg/ror/xchg on the register's high and low bytes;
+# otherwise a plain "bswap" is handed to generic().
+sub ::bswap
+{   if ($i386) # emulate bswap for i386
+    {  &comment("bswap @_");
+       &xchg(&HB(@_),&LB(@_));
+       &ror (@_,16);
+       &xchg(&HB(@_),&LB(@_));
+    }
+    else
+    {  &generic("bswap",@_);   }
+}
+# These are made-up opcodes introduced over the years essentially
+# by ignorance, just alias them to real ones...
+# Each alias forwards its arguments unchanged to the sub of the real
+# mnemonic named in its body.
+sub ::movb     { &mov(@_);     }
+sub ::xorb     { &xor(@_);     }
+sub ::rotl     { &rol(@_);     }
+sub ::rotr     { &ror(@_);     }
+sub ::exch     { &xchg(@_);    }
+sub ::halt     { &hlt;         }
+sub ::movz     { &movzx(@_);   }
+sub ::pushf    { &pushfd;      }
+sub ::popf     { &popfd;       }
+
+# 3 argument instructions
+# movq p1,p2[,optimize]: when the third argument is true and both
+# operands are MMX registers, emit pshufw with selector 0xe4 instead of
+# movq; otherwise pass all arguments to generic().
+sub ::movq
+{ my($p1,$p2,$optimize)=@_;
+
+    if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
+    # movq between mmx registers can sink Intel CPUs
+    {  &::pshufw($p1,$p2,0xe4);                }
+    else
+    {  &::generic("movq",@_);                  }
+}
+
+# SSE>2 instructions
+my %regrm = (  "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+               "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7  );
+# pextrd dst,src,imm: when dst is a 32-bit GP register and src is xmm0-7,
+# emit the raw bytes 66 0F 3A 16 /r ib through data_byte(); otherwise
+# pass the instruction to generic().
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
+    else
+    {  &::generic("pextrd",@_);                }
+}
+
+# pinsrd dst,src,imm: when dst is xmm0-7 and src is a 32-bit GP register,
+# emit the raw bytes 66 0F 3A 22 /r ib through data_byte(); otherwise
+# pass the instruction to generic().
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
+    else
+    {  &::generic("pinsrd",@_);                }
+}
+
+# pshufb dst,src: for two xmm0-7 registers emit the raw bytes
+# 66 0F 38 00 /r through data_byte(); otherwise pass the instruction
+# to generic().
+sub ::pshufb
+{ my($dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);      }
+    else
+    {  &::generic("pshufb",@_);                }
+}
+
+# palignr dst,src,imm: for two xmm0-7 registers emit the raw bytes
+# 66 0F 3A 0F /r ib through data_byte(); otherwise pass the instruction
+# to generic().
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
+    else
+    {  &::generic("palignr",@_);               }
+}
+
+# pclmulqdq dst,src,imm: for two xmm0-7 registers emit the raw bytes
+# 66 0F 3A 44 /r ib through data_byte(); otherwise pass the instruction
+# to generic().
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {  &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
+    else
+    {  &::generic("pclmulqdq",@_);             }
+}
+
+# rdrand dst: for a 32-bit GP register emit the raw bytes 0F C7 /6
+# through data_byte(); otherwise pass the instruction to generic().
+sub ::rdrand
+{ my ($dst)=@_;
+    if ($dst =~ /(e[a-dsd][ixp])/)
+    # look up the captured register name, as the sibling subs do, so the
+    # table lookup uses exactly what the pattern matched
+    {  &::data_byte(0x0f,0xc7,0xf0|$regrm{$1});        }
+    else
+    {  &::generic("rdrand",@_);        }
+}
+
+# label management
+$lbdecor="L";          # local label decoration, set by package
+$label="000";
+
+# Return the argument if it is already one of the decorated label values;
+# otherwise return the %label entry stored under that name.
+sub ::islabel          # see if argument is a known label
+{ my $i;
+    foreach $i (values %label) { return $i if ($i eq $_[0]); }
+  $label{$_[0]};       # can be undef
+}
+
+# Return the decorated name for a label, creating it on first use as
+# $lbdecor . <counter> . <name> and then advancing the $label counter.
+sub ::label            # instantiate a function-scope label
+{   if (!defined($label{$_[0]}))
+    {  $label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++;   }
+  $label{$_[0]};
+}
+
+# Register $_[1] as the value for label $_[0] unless one is already
+# defined, and return the stored value.
+sub ::LABEL            # instantiate a file-scope label
+{   $label{$_[0]}=$_[1] if (!defined($label{$_[0]}));
+  $label{$_[0]};
+}
+
+# File-scope label whose value is $lbdecor followed by the name itself.
+sub ::static_label     { &::LABEL($_[0],$lbdecor.$_[0]); }
+
+# Append a "<label>:" line to the output.
+sub ::set_label_B      { push(@out,"@_:\n"); }
+# Place function-scope label $_[0] in the output, preceded by an
+# alignment directive when $_[1] is greater than 1; returns the
+# decorated label name.
+sub ::set_label
+{ my $label=&::label($_[0]);
+    &::align($_[1]) if ($_[1]>1);
+    &::set_label_B($label);
+  $label;
+}
+
+# Delete every label whose value is $lbdecor followed by three digits,
+# i.e. the form produced by ::label.
+sub ::wipe_labels      # wipes function-scope labels
+{   foreach $i (keys %label)
+    {  delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/); }
+}
+
+# subroutine management
+# Open a function: emit the target-specific header via function_begin_B,
+# reset $stack to 4 and push ebp, ebx, esi and edi.
+sub ::function_begin
+{   &function_begin_B(@_);
+    $stack=4;
+    &push("ebp");
+    &push("ebx");
+    &push("esi");
+    &push("edi");
+}
+
+# Close a function: pop the four registers pushed by function_begin,
+# emit ret and the target-specific trailer, then reset $stack and drop
+# the function-scope labels.
+sub ::function_end
+{   &pop("edi");
+    &pop("esi");
+    &pop("ebx");
+    &pop("ebp");
+    &ret();
+    &function_end_B(@_);
+    $stack=0;
+    &wipe_labels();
+}
+
+# Emit the same pop sequence and ret as function_end but leave the
+# function open; $stack is restored to its value before the pops.
+sub ::function_end_A
+{   &pop("edi");
+    &pop("esi");
+    &pop("ebx");
+    &pop("ebp");
+    &ret();
+    $stack+=16;        # readjust esp as if we didn't pop anything
+}
+
+# Emit a string as NUL-terminated byte data, at most 16 bytes per
+# data_byte() call.
+sub ::asciz
+{ my @bytes=(unpack("C*",shift),0);
+    # full rows of 16 while more than 16 bytes remain ...
+    &data_byte(splice(@bytes,0,16)) while ($#bytes>15);
+    # ... then the remainder
+    &data_byte(@bytes) if (@bytes);
+}
+
+# Let the target module append its trailer via file_end(), then print
+# everything collected in @out.
+sub ::asm_finish
+{   &file_end();
+    print @out;
+}
+
+# Initialise the generator.
+#   $type  target flavour; selects the flag variables and which back-end
+#          module (x86gas.pl, x86nasm.pl or x86masm.pl) is loaded
+#   $fn    name of the generating script; with a trailing ".pl" removed
+#          it is passed to file()
+#   $cpu   stored in $i386 (true selects the i386-only code paths)
+# An unknown $type prints the list of targets to STDERR and exits with 1.
+sub ::asm_init
+{ my ($type,$fn,$cpu)=@_;
+
+    $filename=$fn;
+    $i386=$cpu;
+
+    $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0;
+    if    (($type eq "elf"))
+    {  $elf=1;                 require "x86gas.pl";    }
+    elsif (($type eq "a\.out"))
+    {  $aout=1;                require "x86gas.pl";    }
+    elsif (($type eq "coff" or $type eq "gaswin"))
+    {  $coff=1;                require "x86gas.pl";    }
+    elsif (($type eq "win32n"))
+    {  $win32=1;               require "x86nasm.pl";   }
+    elsif (($type eq "nw-nasm"))
+    {  $netware=1;             require "x86nasm.pl";   }
+    #elsif (($type eq "nw-mwasm"))
+    #{ $netware=1; $mwerks=1;  require "x86nasm.pl";   }
+    elsif (($type eq "win32"))
+    {  $win32=1;               require "x86masm.pl";   }
+    elsif (($type eq "macosx"))
+    {  $aout=1; $macosx=1;     require "x86gas.pl";    }
+    elsif (($type eq "android"))
+    {  $elf=1; $android=1;     require "x86gas.pl";    }
+    else
+    {  print STDERR <<"EOF";
+Pick one target type from
+       elf     - Linux, FreeBSD, Solaris x86, etc.
+       a.out   - DJGPP, elder OpenBSD, etc.
+       coff    - GAS/COFF such as Win32 targets
+       gaswin  - same as coff
+       win32n  - Windows 95/Windows NT NASM format
+       win32   - Windows 95/Windows NT MASM format
+       nw-nasm - NetWare NASM format
+       macosx  - Mac OS X
+       android - Android
+EOF
+       exit(1);
+    }
+
+    # any -fPIC/-KPIC style switch on the command line enables PIC output
+    $pic=0;
+    for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); }
+
+    $filename =~ s/\.pl$//;
+    &file($filename);
+}
+
+1;
diff --git a/devel/perlasm/x86gas.pl b/devel/perlasm/x86gas.pl
new file mode 100644
index 0000000..4af8718
--- /dev/null
+++ b/devel/perlasm/x86gas.pl
@@ -0,0 +1,255 @@
+#!/usr/bin/env perl
+
+package x86gas;
+
+*out=\@::out;
+
+$::lbdecor=$::aout?"L":".L";           # local label decoration
+$nmdecor=($::aout or $::coff)?"_":"";  # external name decoration
+
+$initseg="";
+
+$align=16;
+$align=log($align)/log(2) if ($::aout);
+$com_start="#" if ($::aout or $::coff);
+
+# Return the AT&T size suffix implied by a %-prefixed register name:
+# "l" for %e.., "b" for %[a-d][hl], undef for names starting %x or %m
+# (xmm/mmx registers take no suffix) and "w" for anything else.
+# Declared without the former empty prototype, which contradicted the
+# one argument this sub takes.
+sub opsize
+{ my $reg=shift;
+    if    ($reg =~ m/^%e/o)            { "l"; }
+    elsif ($reg =~ m/^%[a-d][hl]$/o)   { "b"; }
+    elsif ($reg =~ m/^%[xm]/o)         { undef; }
+    else                               { "w"; }
+}
+
+# swap arguments;
+# expand opcode with size suffix;
+# prefix numeric constants with $;
+# Emit one instruction in AT&T syntax. The operands arrive in
+# destination-first order and are reversed; bare register names get a %
+# prefix and numeric constants a $ prefix. A size suffix derived from the
+# register operands ("l" when neither is a register) is appended to the
+# opcode, except for operand-less opcodes, one-operand jumps, call,
+# clflush and set*, and when an xmm/mmx register is involved.
+# Always returns 1.
+sub ::generic
+{ my($opcode,@arg)=@_;
+  my($suffix,$dst,$src);
+
+    @arg=reverse(@arg);
+
+    for (@arg)
+    {  s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o;    # gp registers
+       s/^([xy]?mm[0-7])$/%$1/o;               # xmm/mmx registers
+       s/^(\-?[0-9]+)$/\$$1/o;                 # constants
+       s/^(\-?0x[0-9a-f]+)$/\$$1/o;            # constants
+    }
+
+    $dst = $arg[$#arg]         if ($#arg>=0);
+    $src = $arg[$#arg-1]       if ($#arg>=1);
+    if    ($dst =~ m/^%/o)     { $suffix=&opsize($dst); }
+    elsif ($src =~ m/^%/o)     { $suffix=&opsize($src); }
+    else                       { $suffix="l";           }
+    undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
+
+    if ($#_==0)                                { &::emit($opcode);             }
+    elsif ($opcode =~ m/^j/o && $#_==1)        { &::emit($opcode,@arg);        }
+    elsif ($opcode eq "call" && $#_==1)        { &::emit($opcode,@arg);        }
+    elsif ($opcode eq "clflush" && $#_==1){ &::emit($opcode,@arg);     }
+    elsif ($opcode =~ m/^set/&& $#_==1)        { &::emit($opcode,@arg);        }
+    else                               { &::emit($opcode.$suffix,@arg);}
+
+  1;
+}
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+# Intel-style mnemonics mapped onto the names this back-end emits.
+sub ::movzx    { &::movzb(@_);                 }
+sub ::pushfd   { &::pushfl;                    }
+sub ::popfd    { &::popfl;                     }
+# cpuid and rdtsc are emitted as raw opcode bytes
+sub ::cpuid    { &::emit(".byte\t0x0f,0xa2");  }
+sub ::rdtsc    { &::emit(".byte\t0x0f,0x31");  }
+
+# call: a known label is used as is, any other name gets $nmdecor prepended
+sub ::call     { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); }
+# indirect call/jmp: the operand is prefixed with "*"
+sub ::call_ptr { &::generic("call","*$_[0]");  }
+sub ::jmp_ptr  { &::generic("jmp","*$_[0]");   }
+
+# when $::i386 is not set, replace ::bswap with a direct "bswap %reg"
+*::bswap = sub { &::emit("bswap","%$_[0]");    } if (!$::i386);
+
+# Build an AT&T memory operand of the form addr(reg1,reg2,idx).
+#   $addr  displacement or symbol; a leading symbol that is not a known
+#          label gets $nmdecor prepended; omitted when empty or 0
+#   $reg1  base register (without %), optional
+#   $reg2  index register (without %), optional
+#   $idx   scale for $reg2; a zero or missing value becomes 1
+sub ::DWP
+{ my($addr,$reg1,$reg2,$idx)=@_;
+  my $ret="";
+
+    $addr =~ s/^\s+//;
+    # prepend global references with optional underscore
+    $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige;
+
+    $reg1 = "%$reg1" if ($reg1);
+    $reg2 = "%$reg2" if ($reg2);
+
+    $ret .= $addr if (($addr ne "") && ($addr ne 0));
+
+    if ($reg2)
+    {  $idx!= 0 or $idx=1;
+       $ret .= "($reg1,$reg2,$idx)";
+    }
+    elsif ($reg1)
+    {  $ret .= "($reg1)";      }
+
+  $ret;
+}
+# This back-end builds every operand size through DWP(), so the sized
+# variants simply forward to it.
+sub ::QWP      { &::DWP(@_);   }
+sub ::BP       { &::DWP(@_);   }
+sub ::WP       { &::DWP(@_);   }
+# constants are returned unchanged
+sub ::BC       { @_;           }
+sub ::DWC      { @_;           }
+
+# Start the output with a .file directive naming "<arg>.s" and switch
+# to .text.
+sub ::file
+{   push(@out,".file\t\"$_[0].s\"\n.text\n");  }
+
+# Emit the header of a function: optional .globl, the symbol type
+# directive for the object format, alignment and the entry label(s).
+# Names that do not start with "_" are treated as global and also get a
+# local "<lbdecor>_<name>_begin" label, which is what LABEL() records for
+# them. Resets $::stack to 4.
+sub ::function_begin_B
+{ my $func=shift;
+  my $global=($func !~ /^_/);
+  my $begin="${::lbdecor}_${func}_begin";
+
+    &::LABEL($func,$global?"$begin":"$nmdecor$func");
+    $func=$nmdecor.$func;
+
+    push(@out,".globl\t$func\n")       if ($global);
+    if ($::coff)
+    {  push(@out,".def\t$func;\t.scl\t".(3-$global).";\t.type\t32;\t.endef\n"); }
+    elsif (($::aout and !$::pic) or $::macosx)
+    { }
+    else
+    {  push(@out,".type\t$func,\@function\n"); }
+    push(@out,".align\t$align\n");
+    push(@out,"$func:\n");
+    push(@out,"$begin:\n")             if ($global);
+    $::stack=4;
+}
+
+# Emit the function trailer: a .size directive when $::elf is set,
+# measured from the label recorded for the function. Then reset
+# $::stack and drop the function-scope labels.
+sub ::function_end_B
+{ my $func=shift;
+    push(@out,".size\t$nmdecor$func,.-".&::LABEL($func)."\n") if ($::elf);
+    $::stack=0;
+    &::wipe_labels();
+}
+
+# Emit each argument as a comment line. When no comment leader is
+# defined, or when $::elf is set, a single empty line is emitted instead.
+# Arguments that are empty or all whitespace produce an empty line.
+sub ::comment
+       {
+       if (!defined($com_start) or $::elf)
+               {       # Regarding $::elf above...
+                       # GNU and SVR4 as'es use different comment delimiters,
+               push(@out,"\n");        # so we just skip ELF comments...
+               return;
+               }
+       foreach (@_)
+               {
+               if (/^\s*$/)
+                       { push(@out,"\n"); }
+               else
+                       { push(@out,"\t$com_start $_ $com_end\n"); }
+               }
+       }
+
+# Register each name as a file-scope label whose value is the name with
+# $nmdecor prepended.
+sub ::external_label
+{   foreach(@_) { &::LABEL($_,$nmdecor.$_); }   }
+
+# Register the name the same way and emit a .globl directive for it.
+sub ::public_label
+{   push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n");   }
+
+# Emit the end-of-file material:
+#  - on Mac OS X, the non-lazy symbol pointer section for every symbol
+#    recorded by picmeup();
+#  - a .comm definition of OPENSSL_ia32cap_P when the output refers to it;
+#  - the initialisation segment collected by initseg(), if any.
+sub ::file_end
+{   if ($::macosx)
+    {  if (%non_lazy_ptr)
+       {   push(@out,".section\t__IMPORT,__pointers,non_lazy_symbol_pointers\n");
+           foreach $i (keys %non_lazy_ptr)
+           {   push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n");   }
+       }
+    }
+    if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
+       my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+       if ($::macosx)  { push (@out,"$tmp,2\n"); }
+       elsif ($::elf)  { push (@out,"$tmp,4\n"); }
+       else            { push (@out,"$tmp\n"); }
+    }
+    push(@out,$initseg) if ($initseg);
+}
+
+# Data emitters: the arguments are comma-joined after a .byte, .value
+# or .long directive respectively.
+sub ::data_byte        {   push(@out,".byte\t".join(',',@_)."\n");   }
+sub ::data_short{   push(@out,".value\t".join(',',@_)."\n");  }
+sub ::data_word {   push(@out,".long\t".join(',',@_)."\n");   }
+
+# Emit an .align directive for the requested byte alignment. When $::aout
+# is set the value is converted to its power-of-two exponent and 0x90 is
+# given as the fill byte.
+sub ::align
+{ my $val=$_[0];
+  my $p2;      # bit count of $val; lexical rather than an accidental global
+    if ($::aout)
+    {  for ($p2=0;$val!=0;$val>>=1) { $p2++; }
+       $val=$p2-1;
+       $val.=",0x90";
+    }
+    push(@out,".align\t$val\n");
+}
+
+# Load the address of symbol $sym into register $dst.
+#   $base      register already holding the address of $reflabel;
+#              when undefined a call/pop sequence is emitted to obtain
+#              the current address in $dst
+#   $reflabel  label that $base points at
+# In position-independent mode (PIC on ELF/a.out, or always on Mac OS X)
+# the address is fetched through the non-lazy pointer table (Mac OS X) or
+# the GOT; otherwise a plain lea of the symbol is emitted.
+sub ::picmeup
+{ my($dst,$sym,$base,$reflabel)=@_;
+
+    if (($::pic && ($::elf || $::aout)) || $::macosx)
+    {  if (!defined($base))
+       {   &::call(&::label("PIC_me_up"));
+           &::set_label("PIC_me_up");
+           &::blindpop($dst);
+           $base=$dst;
+           $reflabel=&::label("PIC_me_up");
+       }
+       if ($::macosx)
+       {   my $indirect=&::static_label("$nmdecor$sym\$non_lazy_ptr");
+           &::mov($dst,&::DWP("$indirect-$reflabel",$base));
+           # remembered so file_end() can emit the pointer entry
+           $non_lazy_ptr{"$nmdecor$sym"}=$indirect;
+       }
+       else
+       {   &::lea($dst,&::DWP("_GLOBAL_OFFSET_TABLE_+[.-$reflabel]",
+                           $base));
+           &::mov($dst,&::DWP("$sym\@GOT",$dst));
+       }
+    }
+    else
+    {  &::lea($dst,&::DWP($sym));      }
+}
+
+# Append to $initseg the target-specific text that refers to function
+# $f (the argument with $nmdecor prepended) from the object format's
+# initialisation section. file_end() adds $initseg to the output.
+sub ::initseg
+{ my $f=$nmdecor.shift;
+
+    if ($::android)
+    {  $initseg.=<<___;
+.section       .init_array
+.align 4
+.long  $f
+___
+    }
+    elsif ($::elf)
+    {  $initseg.=<<___;
+.section       .init
+       call    $f
+___
+    }
+    elsif ($::coff)
+    {   $initseg.=<<___;       # applies to both Cygwin and Mingw
+.section       .ctors
+.long  $f
+___
+    }
+    elsif ($::macosx)
+    {  $initseg.=<<___;
+.mod_init_func
+.align 2
+.long   $f
+___
+    }
+    elsif ($::aout)
+    {  my $ctor="${nmdecor}_GLOBAL_\$I\$$f";
+       $initseg.=".text\n";
+       $initseg.=".type\t$ctor,\@function\n" if ($::pic);
+       $initseg.=<<___;        # OpenBSD way...
+.globl $ctor
+.align 2
+$ctor:
+       jmp     $f
+___
+    }
+}
+
+# Switch the output to the .data section.
+sub ::dataseg
+{   push(@out,".data\n");   }
+
+1;
diff --git a/devel/perlasm/x86masm.pl b/devel/perlasm/x86masm.pl
new file mode 100644
index 0000000..ee446de
--- /dev/null
+++ b/devel/perlasm/x86masm.pl
@@ -0,0 +1,196 @@
+#!/usr/bin/env perl
+
+package x86masm;
+
+*out=\@::out;          # alias this package's @out to main's @out
+
+$::lbdecor="\$L";      # local label decoration
+$nmdecor="_";          # external name decoration
+
+$initseg="";           # text accumulated by ::initseg and appended by ::file_end
+$segment="";           # name of the currently open segment (set by ::file and ::dataseg)
+
+sub ::generic  # apply MASM-specific operand fix-ups, then pass the instruction to ::emit; returns 1
+{ my ($opcode,@arg)=@_;        # "=@_" restored from the archive's "address@hidden" garbling
+
+    # fix hexadecimal constants (0x1f becomes 01fh)
+    for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; }
+
+    if ($opcode !~ /movq/)
+    {  # fix xmm references: the memory operand paired with an xmm register becomes XMMWORD PTR
+       $arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i);
+       $arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i);
+    }
+
+    &::emit($opcode,@arg);
+  1;
+}
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+sub ::call     { my $tgt=(&::islabel($_[0]) or "$nmdecor$_[0]"); &::emit("call",$tgt); }      # known label as-is, else decorated name
+sub ::call_ptr { return &::emit("call",@_);   }
+sub ::jmp_ptr  { return &::emit("jmp",@_);    }
+sub ::lock     { return &::data_byte(0xf0);   }       # emitted as a raw 0xf0 byte
+
+sub get_mem    # build a MASM memory operand of the form "SIZE PTR disp[reg2*idx+reg1-off]"
+{ my($size,$addr,$reg1,$reg2,$idx)=@_; # "=@_" restored from the archive's "address@hidden" garbling
+  my($post,$ret);
+
+    $ret .= "$size PTR " if ($size ne "");
+
+    $addr =~ s/^\s+//;
+    # prepend global references with optional underscore
+    $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige;
+    # put address arithmetic expression in parenthesis
+    $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/);
+
+    if (($addr ne "") && ($addr ne 0))
+    {  if ($addr !~ /^-/)      { $ret .= "$addr";  }   # non-negative displacement goes before the bracket
+       else                    { $post=$addr;      }   # negative one is appended after the registers
+    }
+    $ret .= "[";
+
+    if ($reg2 ne "")
+    {  $idx!=0 or $idx=1;                              # scale defaults to 1
+       $ret .= "$reg2*$idx";
+       $ret .= "+$reg1" if ($reg1 ne "");
+    }
+    else
+    {  $ret .= "$reg1";   }
+
+    $ret .= "$post]";
+    $ret =~ s/\+\]/]/; # in case $addr was the only argument
+    $ret =~ s/\[\s*\]//;       # drop an empty "[]" when no register was given
+
+  $ret;
+}
+sub ::BP       { return &get_mem("BYTE",@_);  }       # byte-sized memory operand
+sub ::WP       { return &get_mem("WORD",@_);  }
+sub ::DWP      { return &get_mem("DWORD",@_); }
+sub ::QWP      { return &get_mem("QWORD",@_); }
+sub ::BC       { return "@_";  }                      # constants are passed through unchanged
+sub ::DWC      { return "@_"; }
+
+sub ::file     # emit the MASM file prologue for module $_[0] and open the .text$ code segment
+{ my $tmp=<<___;       # "\@Version" below restored from the archive's "address@hidden" garbling
+TITLE  $_[0].asm
+IF \@Version LT 800
+ECHO MASM version 8.00 or later is strongly recommended.
+ENDIF
+.486
+.MODEL FLAT
+OPTION DOTNAME
+IF \@Version LT 800
+.text\$        SEGMENT PAGE 'CODE'
+ELSE
+.text\$        SEGMENT ALIGN(64) 'CODE'
+ENDIF
+___
+    push(@out,$tmp);
+    $segment = ".text\$";      # remembered so ::dataseg and ::file_end can close it
+}
+
+sub ::function_begin_B # open a PROC; names without a leading "_" are PUBLIC, the rest PRIVATE
+{ my $name=shift;
+  my $exported=($name !~ /^_/);
+  my $entry="${::lbdecor}_${name}_begin";
+
+    &::LABEL($name,$exported ? $entry : "$nmdecor$name");
+    my $text="ALIGN\t16\n$nmdecor$name\tPROC";
+
+    $text.=$exported ? " PUBLIC\n${entry}::\n" : " PRIVATE\n";
+
+    push(@out,$text);
+    $::stack=4;
+}
+sub ::function_end_B   # emit ENDP for the named function, zero $::stack, call ::wipe_labels
+{ my $name=shift;
+
+    push(@out,$nmdecor.$name." ENDP\n");
+    $::stack=0;
+    &::wipe_labels();
+}
+
+sub ::file_end # finish the MASM file: xmm header if needed, close segment, capability COMM, init segment, END
+{ my $xmmheader=<<___; # "\@Version" below restored from the archive's "address@hidden" garbling
+.686
+.XMM
+IF \@Version LT 800
+XMMWORD STRUCT 16
+DQ     2 dup (?)
+XMMWORD        ENDS
+ENDIF
+___
+    if (grep {/\b[x]?mm[0-7]\b/i} @out) {
+       s/\.[3-7]86/$xmmheader/ for @out;       # mm/xmm registers used: replace the CPU directive with the header
+    }
+
+    push(@out,"$segment        ENDS\n");
+
+    if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
+    {  my $comm=<<___;
+.bss   SEGMENT 'BSS'
+COMM   ${nmdecor}OPENSSL_ia32cap_P:QWORD
+.bss   ENDS
+___
+       # comment out OPENSSL_ia32cap_P declarations
+       s/(^EXTERN\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/ for @out;
+       push (@out,$comm);
+    }
+    push (@out,$initseg) if ($initseg);
+    push (@out,"END\n");
+}
+
+sub ::comment {   push(@out,map { "\t; $_\n" } @_);   }       # one ";" comment line per argument
+
+*::set_label_B = sub   # labels of the form <lbdecor><3 digits>... get ":", all others "::"
+{ my $lbl=shift; my $sep=($lbl=~/^\Q${::lbdecor}\E[0-9]{3}/)?":":"::"; push(@out,"$lbl$sep\n"); };
+
+sub ::external_label   # declare each argument as an EXTERN ...:NEAR symbol
+{   for my $sym (@_)
+    {  my $decorated=&::LABEL($sym,$nmdecor.$sym); push(@out,"EXTERN\t${decorated}:NEAR\n");   }
+}
+
+sub ::public_label     # emit a PUBLIC directive for the first argument
+{   my $decorated=&::LABEL($_[0],$nmdecor.$_[0]); push(@out,"PUBLIC\t$decorated\n");   }
+
+sub ::data_byte        # DB line listing all arguments
+{   my $list=join(',',@_); push(@out,"DB\t$list\n");   }
+
+sub ::data_short       # DW line listing all arguments
+{   my $list=join(',',@_); push(@out,"DW\t$list\n");   }
+
+sub ::data_word        # DD line listing all arguments
+{   my $list=join(',',@_); push(@out,"DD\t$list\n");   }
+
+sub ::align    # emit ALIGN with the first argument
+{   my $boundary=$_[0]; push(@out,"ALIGN\t$boundary\n");   }
+
+sub ::picmeup  # no PIC sequence in this back-end: load the address of $sym into $dst with a plain lea
+{ my($dst,$sym)=@_;    # "=@_" restored from the archive's "address@hidden" garbling
+    &::lea($dst,&::DWP($sym));
+}
+
+sub ::initseg  # append a .CRT$XCU segment entry for the decorated function name to $initseg
+{ my $init=shift; $init="$nmdecor$init";
+
+    $initseg.=<<___;
+.CRT\$XCU      SEGMENT DWORD PUBLIC 'DATA'
+EXTERN ${init}:NEAR
+DD     $init
+.CRT\$XCU      ENDS
+___
+}
+
+sub ::dataseg  # close the currently open segment and open _DATA
+{   my $prev=$segment; $segment="_DATA"; push(@out,"$prev\tENDS\n$segment\tSEGMENT\n"); $segment;   }
+
+sub ::safeseh  # emit .SAFESEH for handler $_[0], wrapped in an "IF @Version GE 710" guard
+{ my $nm=shift;
+    push(@out,"IF \@Version GE 710\n");       # "\@Version" restored from the archive's "address@hidden" garbling
+    push(@out,".SAFESEH        ".&::LABEL($nm,$nmdecor.$nm)."\n");
+    push(@out,"ENDIF\n");
+}
+
+1;
diff --git a/devel/perlasm/x86nasm.pl b/devel/perlasm/x86nasm.pl
new file mode 100644
index 0000000..ca2511c
--- /dev/null
+++ b/devel/perlasm/x86nasm.pl
@@ -0,0 +1,177 @@
+#!/usr/bin/env perl
+
+package x86nasm;
+
+*out=\@::out;                  # alias this package's @out to main's @out
+
+$::lbdecor="L\$";              # local label decoration
+$nmdecor=$::netware?"":"_";    # external name decoration
+$drdecor=$::mwerks?".":"";     # directive decoration
+
+$initseg="";                   # text set by ::initseg and appended by ::file_end
+
+sub ::generic  # NASM-specific operand fix-ups (skipped for mwerks), then pass to ::emit; returns 1
+{ my $mnemonic=shift;
+  my $nargs=scalar(@_);
+
+    unless ($::mwerks)
+    {   if    ($nargs==1 && $mnemonic =~ m/^j/)         # optimize jumps
+       {   $_[0] = "NEAR ".$_[0];      }
+       elsif ($nargs==2 && $mnemonic eq "lea")          # wipe storage qualifier from lea
+       {   $_[1] =~ s/^[^\[]*\[/\[/;   }
+       elsif ($nargs==1 && $mnemonic eq "clflush")      # same stripping for clflush
+       {   $_[0] =~ s/^[^\[]*\[/\[/;   }
+    }
+    &::emit($mnemonic,@_);
+  1;
+}
+#
+# opcodes not covered by ::generic above, mostly inconsistent namings...
+#
+sub ::call     { my $tgt=(&::islabel($_[0]) or "$nmdecor$_[0]"); &::emit("call",$tgt); }      # known label as-is, else decorated name
+sub ::call_ptr { return &::emit("call",@_);   }
+sub ::jmp_ptr  { return &::emit("jmp",@_);    }
+
+sub get_mem    # build a NASM memory operand of the form "SIZE [disp+reg2*idx+reg1-off]"
+{ my($size,$addr,$reg1,$reg2,$idx)=@_; # "=@_" restored from the archive's "address@hidden" garbling
+  my($post,$ret);
+
+    if ($size ne "")
+    {  $ret .= "$size";
+       $ret .= " PTR" if ($::mwerks);
+       $ret .= " ";
+    }
+    $ret .= "[";
+
+    $addr =~ s/^\s+//;
+    # prepend global references with optional underscore
+    $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige;
+    # put address arithmetic expression in parenthesis
+    $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/);
+
+    if (($addr ne "") && ($addr ne 0))
+    {  if ($addr !~ /^-/)      { $ret .= "$addr+"; }   # non-negative displacement leads inside the bracket
+       else                    { $post=$addr;      }   # negative one is appended after the registers
+    }
+
+    if ($reg2 ne "")
+    {  $idx!=0 or $idx=1;                              # scale defaults to 1
+       $ret .= "$reg2*$idx";
+       $ret .= "+$reg1" if ($reg1 ne "");
+    }
+    else
+    {  $ret .= "$reg1";   }
+
+    $ret .= "$post]";
+    $ret =~ s/\+\]/]/; # in case $addr was the only argument
+
+  $ret;
+}
+sub ::BP       { return &get_mem("BYTE",@_);  }
+sub ::DWP      { return &get_mem("DWORD",@_); }
+sub ::WP       { return &get_mem("WORD",@_);  }
+sub ::QWP      { return &get_mem("",@_);      }       # no size keyword is emitted for QWP
+sub ::BC       { my $kw=$::mwerks ? "" : "BYTE ";  return $kw."@_"; }
+sub ::DWC      { my $kw=$::mwerks ? "" : "DWORD "; return $kw."@_"; }
+
+sub ::file     # emit the code-section header: a .section line for mwerks, else a NASM %ifidn ladder
+{   if ($::mwerks)     { push(@out,".section\t.text,64\n"); }
+    else
+    { my $tmp=<<___;   # the "\$\@feat.00" line below is restored from the archive's "address@hidden" garbling
+%ifidn __OUTPUT_FORMAT__,obj
+section        code    use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+\$\@feat.00 equ 1
+section        .text   code align=64
+%else
+section        .text   code
+%endif
+___
+       push(@out,$tmp);
+    }
+}
+
+sub ::function_begin_B # emit a function's entry labels; names without a leading "_" are declared global
+{ my $name=shift;
+  my $exported=($name !~ /^_/);
+  my $entry="${::lbdecor}_${name}_begin";
+
+    $entry =~ s/^\@/./ if ($::mwerks); # the torture never stops
+
+    &::LABEL($name,$exported ? $entry : "$nmdecor$name");
+    my $symbol=$nmdecor.$name;
+
+    push(@out,"${drdecor}global        $symbol\n")     if ($exported);
+    push(@out,"${drdecor}align 16\n","$symbol:\n");
+    push(@out,"$entry:\n")                     if ($exported);
+
+    $::stack=4;
+}
+
+sub ::function_end_B   # zero $::stack and call ::wipe_labels; emits no output
+{   $::stack=0;
+    return &::wipe_labels();
+}
+
+sub ::file_end # append the OPENSSL_ia32cap_P common block (if referenced) and any stored init segment
+{   my $capsym="${nmdecor}OPENSSL_ia32cap_P";
+    if (grep {/\b${capsym}\b/i} @out)
+    {  # comment out OPENSSL_ia32cap_P declarations
+       s/(^extern\s+${capsym})/\;$1/ for @out;
+       push (@out,<<___);
+${drdecor}segment      .bss
+${drdecor}common       ${nmdecor}OPENSSL_ia32cap_P 8
+___
+    }
+    push (@out,$initseg) if ($initseg);
+}
+
+sub ::comment {   push(@out,map { "\t; $_\n" } @_);   }       # one ";" comment line per argument
+
+sub ::external_label   # one extern line per argument
+{   for my $sym (@_)
+    {  my $decorated=&::LABEL($sym,$nmdecor.$sym); push(@out,"${drdecor}extern\t$decorated\n");   }
+}
+
+sub ::public_label     # emit a global directive for the first argument
+{   my $decorated=&::LABEL($_[0],$nmdecor.$_[0]); push(@out,"${drdecor}global\t$decorated\n");  }
+
+sub ::data_byte        # .byte for mwerks, db otherwise
+{   my $dir=$::mwerks ? ".byte\t" : "db\t"; push(@out,$dir.join(',',@_)."\n");       }
+sub ::data_short       # .word for mwerks, dw otherwise
+{   my $dir=$::mwerks ? ".word\t" : "dw\t"; push(@out,$dir.join(',',@_)."\n");       }
+sub ::data_word        # .long for mwerks, dd otherwise
+{   my $dir=$::mwerks ? ".long\t" : "dd\t"; push(@out,$dir.join(',',@_)."\n");       }
+
+sub ::align    # emit an align directive with the first argument
+{   my $boundary=$_[0]; push(@out,"${drdecor}align\t$boundary\n");     }
+
+sub ::picmeup  # no PIC sequence in this back-end: load the address of $sym into $dst with a plain lea
+{ my($dst,$sym)=@_;    # "=@_" restored from the archive's "address@hidden" garbling
+    &::lea($dst,&::DWP($sym));
+}
+
+sub ::initseg  # win32 only: set $initseg to a .CRT$XCU entry for the decorated function name
+{ my $init=shift; $init=$nmdecor.$init;
+    if ($::win32)
+    {  $initseg=<<___;
+segment        .CRT\$XCU data align=4
+extern $init
+dd     $init
+___
+    }
+}
+
+sub ::dataseg  # switch output to the data section; was testing $mwerks, which is never set in this package
+{   if ($::mwerks)     { push(@out,".section\t.data,4\n");   }         # same flag and syntax as ::file uses
+    else               { push(@out,"section\t.data align=4\n"); }
+}
+
+sub ::safeseh  # emit a safeseh directive for handler $_[0], wrapped in a __NASM_VERSION_ID__ check
+{ my $handler=shift;
+    push(@out,"%if     __NASM_VERSION_ID__ >= 0x02030000\n");
+    my $decorated=&::LABEL($handler,$nmdecor.$handler);
+    push(@out,"safeseh $decorated\n","%endif\n");
+}
+
+1;
diff --git a/lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s 
b/lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s
index 2f9a6bc..74e236b 100644
--- a/lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s
+++ b/lib/accelerated/x86/asm-coff/appro-aes-x86-coff.s
@@ -35,7 +35,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-.file  "aesni-x86.s"
+.file  "devel/perlasm/aesni-x86.s"
 .text
 .globl _aesni_encrypt
 .def   _aesni_encrypt; .scl    2;      .type   32;     .endef
diff --git a/lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s 
b/lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s
index d2336e7..3ca96a9 100644
--- a/lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s
+++ b/lib/accelerated/x86/asm-coff/cpuid-x86-64-coff.s
@@ -18,39 +18,37 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-
-       .file "cpuid.asm"
-        
-       .text
-.globl __gnutls_cpuid
-.def   __gnutls_cpuid; .scl 2; .type 32;       .endef
-.p2align 4
-__gnutls_cpuid:
+.text  
+.globl _gnutls_cpuid
+.def   _gnutls_cpuid;  .scl 2; .type 32;       .endef
+.p2align       4
+_gnutls_cpuid:
        pushq   %rbp
-       movq    %rsp, %rbp
+       movq    %rsp,%rbp
        pushq   %rbx
-       movl    %edi, -12(%rbp)
-       movq    %rsi, -24(%rbp)
-       movq    %rdx, -32(%rbp)
-       movq    %rcx, -40(%rbp)
-       movq    %r8, -48(%rbp)
-       movl    -12(%rbp), %eax
-       movl    %eax, -60(%rbp)
-       movl    -60(%rbp), %eax
+       movl    %edi,-12(%rbp)
+       movq    %rsi,-24(%rbp)
+       movq    %rdx,-32(%rbp)
+       movq    %rcx,-40(%rbp)
+       movq    %r8,-48(%rbp)
+       movl    -12(%rbp),%eax
+       movl    %eax,-60(%rbp)
+       movl    -60(%rbp),%eax
        cpuid
-       movl    %edx, -56(%rbp)
-       movl    %ecx, %esi
-       movl    %eax, -52(%rbp)
-       movq    -24(%rbp), %rax
-       movl    -52(%rbp), %edx
-       movl    %edx, (%rax)
-       movq    -32(%rbp), %rax
-       movl    %ebx, (%rax)
-       movq    -40(%rbp), %rax
-       movl    %esi, (%rax)
-       movq    -48(%rbp), %rax
-       movl    -56(%rbp), %ecx
-       movl    %ecx, (%rax)
+       movl    %edx,-56(%rbp)
+       movl    %ecx,%esi
+       movl    %eax,-52(%rbp)
+       movq    -24(%rbp),%rax
+       movl    -52(%rbp),%edx
+       movl    %edx,(%rax)
+       movq    -32(%rbp),%rax
+       movl    %ebx,(%rax)
+       movq    -40(%rbp),%rax
+       movl    %esi,(%rax)
+       movq    -48(%rbp),%rax
+       movl    -56(%rbp),%ecx
+       movl    %ecx,(%rax)
        popq    %rbx
        leave
-       ret
+       .byte   0xf3,0xc3
+
diff --git a/lib/accelerated/x86/asm-coff/cpuid-x86-coff.s 
b/lib/accelerated/x86/asm-coff/cpuid-x86-coff.s
index 92b95db..076b193 100644
--- a/lib/accelerated/x86/asm-coff/cpuid-x86-coff.s
+++ b/lib/accelerated/x86/asm-coff/cpuid-x86-coff.s
@@ -18,51 +18,47 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-
-       .file "cpuid.asm"
-        
-       .text
-.globl __gnutls_cpuid
-.def   __gnutls_cpuid; .scl    2;      .type   32;     .endef
-.align 16
+.file  "devel/perlasm/cpuid-x86.s"
+.text
+.def   __gnutls_cpuid; .scl    3;      .type   32;     .endef
+.align 16
 __gnutls_cpuid:
        pushl   %ebp
-       movl    %esp, %ebp
-       subl    $12, %esp
-       movl    %ebx, (%esp)
-       movl    8(%ebp), %eax
-       movl    %esi, 4(%esp)
-       movl    %edi, 8(%esp)
-       pushl %ebx
-       cpuid
-       movl %ebx, %edi
-       popl %ebx
-       movl    %edx, %esi
-       movl    12(%ebp), %edx
-       movl    %eax, (%edx)
-       movl    16(%ebp), %eax
-       movl    %edi, (%eax)
-       movl    20(%ebp), %eax
-       movl    %ecx, (%eax)
-       movl    24(%ebp), %eax
-       movl    %esi, (%eax)
-       movl    (%esp), %ebx
-       movl    4(%esp), %esi
-       movl    8(%esp), %edi
-       movl    %ebp, %esp
+       movl    %esp,%ebp
+       subl    $12,%esp
+       movl    %ebx,(%esp)
+       movl    8(%ebp),%eax
+       movl    %esi,4(%esp)
+       movl    %edi,8(%esp)
+       pushl   %ebx
+       .byte   0x0f,0xa2
+       movl    %ebx,%edi
+       popl    %ebx
+       movl    %edx,%esi
+       movl    12(%ebp),%edx
+       movl    %eax,(%edx)
+       movl    16(%ebp),%eax
+       movl    %edi,(%eax)
+       movl    20(%ebp),%eax
+       movl    %ecx,(%eax)
+       movl    24(%ebp),%eax
+       movl    %esi,(%eax)
+       movl    (%esp),%ebx
+       movl    4(%esp),%esi
+       movl    8(%esp),%edi
+       movl    %ebp,%esp
        popl    %ebp
        ret
-
-.globl __gnutls_have_cpuid
-.def   __gnutls_have_cpuid;    .scl    2;      .type   32;     .endef
+.def   __gnutls_have_cpuid;    .scl    3;      .type   32;     .endef
 .align 16
 __gnutls_have_cpuid:
-       pushfl  
-       pop %eax        
-       orl $0x200000, %eax     
-       push %eax       
-       popfl   
-       pushfl  
-       pop %eax        
-       andl $0x200000, %eax    
+       pushfl
+       popl    %eax
+       orl     $2097152,%eax
+       pushl   %eax
+       popfl
+       pushfl
+       popl    %eax
+       andl    $2097152,%eax
        ret
+.byte  67,80,85,73,68,32,102,111,114,32,120,56,54,0
diff --git a/lib/accelerated/x86/asm-coff/padlock-x86-coff.s 
b/lib/accelerated/x86/asm-coff/padlock-x86-coff.s
index c1014bd..c9231f1 100644
--- a/lib/accelerated/x86/asm-coff/padlock-x86-coff.s
+++ b/lib/accelerated/x86/asm-coff/padlock-x86-coff.s
@@ -35,7 +35,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-.file  "./engines/asm/e_padlock-x86.s"
+.file  "devel/perlasm/e_padlock-x86.s"
 .text
 .globl _padlock_capability
 .def   _padlock_capability;    .scl    2;      .type   32;     .endef
diff --git a/lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s 
b/lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s
index 620cf47..55da343 100644
--- a/lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s
+++ b/lib/accelerated/x86/asm/appro-aes-gcm-x86-64.s
@@ -35,7 +35,6 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 .text  
 
 .globl gcm_gmult_4bit
@@ -1063,6 +1062,4 @@ gcm_ghash_clmul:
 .byte  
71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif
diff --git a/lib/accelerated/x86/asm/appro-aes-x86-64.s 
b/lib/accelerated/x86/asm/appro-aes-x86-64.s
index efd6375..73c3798 100644
--- a/lib/accelerated/x86/asm/appro-aes-x86-64.s
+++ b/lib/accelerated/x86/asm/appro-aes-x86-64.s
@@ -789,6 +789,7 @@ aesni_ccm64_encrypt_blocks:
        movdqu  (%r9),%xmm3
        movdqa  %xmm9,%xmm2
        movl    %eax,%r10d
+.byte  102,68,15,56,0,207
        jmp     .Lccm64_enc_outer
 .align 16
 .Lccm64_enc_outer:
@@ -813,7 +814,6 @@ aesni_ccm64_encrypt_blocks:
 .byte  102,15,56,220,216
        movups  0(%rcx),%xmm0
        jnz     .Lccm64_enc2_loop
-.byte  102,68,15,56,0,207
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   %xmm6,%xmm9
@@ -826,7 +826,7 @@ aesni_ccm64_encrypt_blocks:
        movdqa  %xmm9,%xmm2
        movups  %xmm8,(%rsi)
        leaq    16(%rsi),%rsi
-.byte  102,68,15,56,0,207
+.byte  102,15,56,0,215
        jnz     .Lccm64_enc_outer
 
        movups  %xmm3,(%r9)
@@ -859,7 +859,6 @@ aesni_ccm64_decrypt_blocks:
 .byte  102,15,56,221,209
        movups  (%rdi),%xmm8
        paddq   %xmm6,%xmm9
-.byte  102,68,15,56,0,207
        leaq    16(%rdi),%rdi
        jmp     .Lccm64_dec_outer
 .align 16
@@ -869,6 +868,7 @@ aesni_ccm64_decrypt_blocks:
        movl    %r10d,%eax
        movups  %xmm8,(%rsi)
        leaq    16(%rsi),%rsi
+.byte  102,15,56,0,215
 
        subq    $1,%rdx
        jz      .Lccm64_dec_break
@@ -896,7 +896,6 @@ aesni_ccm64_decrypt_blocks:
        paddq   %xmm6,%xmm9
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-.byte  102,68,15,56,0,207
        leaq    16(%rdi),%rdi
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
@@ -2572,7 +2571,4 @@ __aesni_set_encrypt_key:
 .byte  
65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif
-
diff --git a/lib/accelerated/x86/asm/appro-aes-x86.s 
b/lib/accelerated/x86/asm/appro-aes-x86.s
index b1ce9bc..4dd1a50 100644
--- a/lib/accelerated/x86/asm/appro-aes-x86.s
+++ b/lib/accelerated/x86/asm/appro-aes-x86.s
@@ -35,7 +35,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-.file  "aesni-x86.s"
+.file  "devel/perlasm/aesni-x86.s"
 .text
 .globl aesni_encrypt
 .type  aesni_encrypt,@function
@@ -596,9 +596,10 @@ aesni_ccm64_encrypt_blocks:
        movl    %ebp,28(%esp)
        shrl    $1,%ecx
        leal    (%edx),%ebp
+       movdqa  (%esp),%xmm5
        movdqa  %xmm7,%xmm2
        movl    %ecx,%ebx
-       movdqa  (%esp),%xmm5
+.byte  102,15,56,0,253
 .L026ccm64_enc_outer:
        movups  (%ebp),%xmm0
        movl    %ebx,%ecx
@@ -619,7 +620,6 @@ aesni_ccm64_encrypt_blocks:
 .byte  102,15,56,220,216
        movups  (%edx),%xmm0
        jnz     .L027ccm64_enc2_loop
-.byte  102,15,56,0,253
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
        paddq   16(%esp),%xmm7
@@ -631,7 +631,7 @@ aesni_ccm64_encrypt_blocks:
        movdqa  %xmm7,%xmm2
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
-.byte  102,15,56,0,253
+.byte  102,15,56,0,213
        jnz     .L026ccm64_enc_outer
        movl    48(%esp),%esp
        movl    40(%esp),%edi
@@ -692,7 +692,6 @@ aesni_ccm64_decrypt_blocks:
 .byte  102,15,56,221,209
        movups  (%esi),%xmm6
        paddq   16(%esp),%xmm7
-.byte  102,15,56,0,253
        leal    16(%esi),%esi
        jmp     .L029ccm64_dec_outer
 .align 16
@@ -702,6 +701,7 @@ aesni_ccm64_decrypt_blocks:
        movl    %ebx,%ecx
        movups  %xmm6,(%edi)
        leal    16(%edi),%edi
+.byte  102,15,56,0,213
        subl    $1,%eax
        jz      .L030ccm64_dec_break
        movups  (%ebp),%xmm0
@@ -726,7 +726,6 @@ aesni_ccm64_decrypt_blocks:
        paddq   16(%esp),%xmm7
 .byte  102,15,56,220,209
 .byte  102,15,56,220,217
-.byte  102,15,56,0,253
        leal    16(%esi),%esi
 .byte  102,15,56,221,208
 .byte  102,15,56,221,216
@@ -2180,7 +2179,4 @@ aesni_set_decrypt_key:
 .byte  32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
 .byte  115,108,46,111,114,103,62,0
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif
-
diff --git a/lib/accelerated/x86/asm/cpuid-x86-64.s 
b/lib/accelerated/x86/asm/cpuid-x86-64.s
index 09755ea..db6a580 100644
--- a/lib/accelerated/x86/asm/cpuid-x86-64.s
+++ b/lib/accelerated/x86/asm/cpuid-x86-64.s
@@ -18,45 +18,39 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-
-       .file "cpuid.asm"
-        
-       .text
-       .align 16
-.globl _gnutls_cpuid
-.type _gnutls_cpuid,%function
+.text  
+.globl _gnutls_cpuid
+.type  _gnutls_cpuid,@function
+.align 16
 _gnutls_cpuid:
        pushq   %rbp
-       movq    %rsp, %rbp
+       movq    %rsp,%rbp
        pushq   %rbx
-       movl    %edi, -12(%rbp)
-       movq    %rsi, -24(%rbp)
-       movq    %rdx, -32(%rbp)
-       movq    %rcx, -40(%rbp)
-       movq    %r8, -48(%rbp)
-       movl    -12(%rbp), %eax
-       movl    %eax, -60(%rbp)
-       movl    -60(%rbp), %eax
+       movl    %edi,-12(%rbp)
+       movq    %rsi,-24(%rbp)
+       movq    %rdx,-32(%rbp)
+       movq    %rcx,-40(%rbp)
+       movq    %r8,-48(%rbp)
+       movl    -12(%rbp),%eax
+       movl    %eax,-60(%rbp)
+       movl    -60(%rbp),%eax
        cpuid
-       movl    %edx, -56(%rbp)
-       movl    %ecx, %esi
-       movl    %eax, -52(%rbp)
-       movq    -24(%rbp), %rax
-       movl    -52(%rbp), %edx
-       movl    %edx, (%rax)
-       movq    -32(%rbp), %rax
-       movl    %ebx, (%rax)
-       movq    -40(%rbp), %rax
-       movl    %esi, (%rax)
-       movq    -48(%rbp), %rax
-       movl    -56(%rbp), %ecx
-       movl    %ecx, (%rax)
+       movl    %edx,-56(%rbp)
+       movl    %ecx,%esi
+       movl    %eax,-52(%rbp)
+       movq    -24(%rbp),%rax
+       movl    -52(%rbp),%edx
+       movl    %edx,(%rax)
+       movq    -32(%rbp),%rax
+       movl    %ebx,(%rax)
+       movq    -40(%rbp),%rax
+       movl    %esi,(%rax)
+       movq    -48(%rbp),%rax
+       movl    -56(%rbp),%ecx
+       movl    %ecx,(%rax)
        popq    %rbx
        leave
-       ret
-.size _gnutls_cpuid, . - _gnutls_cpuid
-
+       .byte   0xf3,0xc3
+.size  _gnutls_cpuid,.-_gnutls_cpuid
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif
diff --git a/lib/accelerated/x86/asm/cpuid-x86.s 
b/lib/accelerated/x86/asm/cpuid-x86.s
index bf3e6ac..2d28bdc 100644
--- a/lib/accelerated/x86/asm/cpuid-x86.s
+++ b/lib/accelerated/x86/asm/cpuid-x86.s
@@ -18,60 +18,51 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-
-       .file "cpuid.asm"
-        
-       .text
-       .align 16
-.globl _gnutls_cpuid
-.type _gnutls_cpuid,%function
+.file  "devel/perlasm/cpuid-x86.s"
+.text
+.type  _gnutls_cpuid,@function
+.align 16
 _gnutls_cpuid:
        pushl   %ebp
-       movl    %esp, %ebp
-       subl    $12, %esp
-       movl    %ebx, (%esp)
-       movl    8(%ebp), %eax
-       movl    %esi, 4(%esp)
-       movl    %edi, 8(%esp)
-       pushl %ebx
-       cpuid
-       movl %ebx, %edi
-       popl %ebx
-       movl    %edx, %esi
-       movl    12(%ebp), %edx
-       movl    %eax, (%edx)
-       movl    16(%ebp), %eax
-       movl    %edi, (%eax)
-       movl    20(%ebp), %eax
-       movl    %ecx, (%eax)
-       movl    24(%ebp), %eax
-       movl    %esi, (%eax)
-       movl    (%esp), %ebx
-       movl    4(%esp), %esi
-       movl    8(%esp), %edi
-       movl    %ebp, %esp
+       movl    %esp,%ebp
+       subl    $12,%esp
+       movl    %ebx,(%esp)
+       movl    8(%ebp),%eax
+       movl    %esi,4(%esp)
+       movl    %edi,8(%esp)
+       pushl   %ebx
+       .byte   0x0f,0xa2
+       movl    %ebx,%edi
+       popl    %ebx
+       movl    %edx,%esi
+       movl    12(%ebp),%edx
+       movl    %eax,(%edx)
+       movl    16(%ebp),%eax
+       movl    %edi,(%eax)
+       movl    20(%ebp),%eax
+       movl    %ecx,(%eax)
+       movl    24(%ebp),%eax
+       movl    %esi,(%eax)
+       movl    (%esp),%ebx
+       movl    4(%esp),%esi
+       movl    8(%esp),%edi
+       movl    %ebp,%esp
        popl    %ebp
        ret
-.size _gnutls_cpuid, . - _gnutls_cpuid
-
-       .globl  _gnutls_have_cpuid
-       .type   _gnutls_have_cpuid, @function
+.size  _gnutls_cpuid,.-_gnutls_cpuid
+.type  _gnutls_have_cpuid,@function
+.align 16
 _gnutls_have_cpuid:
-.LFB0:
-       .cfi_startproc
-       pushfl  
-       pop %eax        
-       orl $0x200000, %eax     
-       push %eax       
-       popfl   
-       pushfl  
-       pop %eax        
-       andl $0x200000, %eax    
+       pushfl
+       popl    %eax
+       orl     $2097152,%eax
+       pushl   %eax
+       popfl
+       pushfl
+       popl    %eax
+       andl    $2097152,%eax
        ret
-       .cfi_endproc
-.LFE0:
-       .size   _gnutls_have_cpuid, .-_gnutls_have_cpuid
+.size  _gnutls_have_cpuid,.-_gnutls_have_cpuid
+.byte  67,80,85,73,68,32,102,111,114,32,120,56,54,0
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif
diff --git a/lib/accelerated/x86/asm/padlock-x86-64.s 
b/lib/accelerated/x86/asm/padlock-x86-64.s
index 156fe38..020d6e5 100644
--- a/lib/accelerated/x86/asm/padlock-x86-64.s
+++ b/lib/accelerated/x86/asm/padlock-x86-64.s
@@ -514,6 +514,4 @@ padlock_cbc_encrypt:
 .Lpadlock_saved_context:
 .quad  0
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif
diff --git a/lib/accelerated/x86/asm/padlock-x86.s 
b/lib/accelerated/x86/asm/padlock-x86.s
index b2fca21..fc5f9ac 100644
--- a/lib/accelerated/x86/asm/padlock-x86.s
+++ b/lib/accelerated/x86/asm/padlock-x86.s
@@ -35,7 +35,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-.file  "padlock-x86.s"
+.file  "devel/perlasm/e_padlock-x86.s"
 .text
 .globl padlock_capability
 .type  padlock_capability,@function
@@ -620,6 +620,4 @@ padlock_sha512_blocks:
 .Lpadlock_saved_context:
 .long  0
 
-#if defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
-#endif


hooks/post-receive
-- 
GNU gnutls



reply via email to

[Prev in Thread] Current Thread [Next in Thread]