Remove trailing whitespace from Perl files.
Upstream did this in 609b0852e4d50251857dbbac3141ba042e35a9ae and it's easier to apply patches if we do also.

Change-Id: I5142693ed1e26640987ff16f5ea510e81bba200e
Reviewed-on: https://boringssl-review.googlesource.com/13771
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
This commit is contained in:
parent
073a06d3da
commit
c948d46569
@ -116,7 +116,7 @@
|
||||
# words every cache-line is *guaranteed* to be accessed within ~50
|
||||
# cycles window. Why just SSE? Because it's needed on hyper-threading
|
||||
# CPU! Which is also why it's prefetched with 64 byte stride. Best
|
||||
# part is that it has no negative effect on performance:-)
|
||||
# part is that it has no negative effect on performance:-)
|
||||
#
|
||||
# Version 4.3 implements switch between compact and non-compact block
|
||||
# functions in AES_cbc_encrypt depending on how much data was asked
|
||||
@ -578,7 +578,7 @@ sub enctransform()
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
# | mm4 | mm0 |
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
# | s3 | s2 | s1 | s0 |
|
||||
# | s3 | s2 | s1 | s0 |
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
|
||||
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
||||
@ -798,7 +798,7 @@ sub encstep()
|
||||
|
||||
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
|
||||
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
|
||||
else { &mov ($tmp,$s[3]);
|
||||
else { &mov ($tmp,$s[3]);
|
||||
&shr ($tmp,24) }
|
||||
&xor ($out,&DWP(1,$te,$tmp,8));
|
||||
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
|
||||
@ -1551,7 +1551,7 @@ sub sse_deccompact()
|
||||
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
|
||||
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
|
||||
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
|
||||
|
||||
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
|
||||
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
|
||||
@ -2021,7 +2021,7 @@ sub declast()
|
||||
{
|
||||
# stack frame layout
|
||||
# -4(%esp) # return address 0(%esp)
|
||||
# 0(%esp) # s0 backing store 4(%esp)
|
||||
# 0(%esp) # s0 backing store 4(%esp)
|
||||
# 4(%esp) # s1 backing store 8(%esp)
|
||||
# 8(%esp) # s2 backing store 12(%esp)
|
||||
# 12(%esp) # s3 backing store 16(%esp)
|
||||
@ -2731,7 +2731,7 @@ sub enckey()
|
||||
&mov (&DWP(80,"edi"),10); # setup number of rounds
|
||||
&xor ("eax","eax");
|
||||
&jmp (&label("exit"));
|
||||
|
||||
|
||||
&set_label("12rounds");
|
||||
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
|
||||
&mov ("ebx",&DWP(4,"esi"));
|
||||
|
||||
@ -1286,7 +1286,7 @@ $code.=<<___;
|
||||
asm_AES_set_encrypt_key:
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12 # redundant, but allows to share
|
||||
push %r12 # redundant, but allows to share
|
||||
push %r13 # exception handler...
|
||||
push %r14
|
||||
push %r15
|
||||
@ -1412,7 +1412,7 @@ $code.=<<___;
|
||||
xor %rax,%rax
|
||||
jmp .Lexit
|
||||
|
||||
.L14rounds:
|
||||
.L14rounds:
|
||||
mov 0(%rsi),%rax # copy first 8 dwords
|
||||
mov 8(%rsi),%rbx
|
||||
mov 16(%rsi),%rcx
|
||||
|
||||
@ -1040,7 +1040,7 @@ if ($PREFIX eq "aesni") {
|
||||
&set_label("ctr32_one_shortcut",16);
|
||||
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
|
||||
&mov ($rounds,&DWP(240,$key));
|
||||
|
||||
|
||||
&set_label("ctr32_one");
|
||||
if ($inline)
|
||||
{ &aesni_inline_generate1("enc"); }
|
||||
|
||||
@ -27,7 +27,7 @@
|
||||
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
|
||||
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
|
||||
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
|
||||
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
|
||||
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
|
||||
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
|
||||
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
|
||||
#
|
||||
@ -111,7 +111,7 @@
|
||||
# performance is achieved by interleaving instructions working on
|
||||
# independent blocks. In which case asymptotic limit for such modes
|
||||
# can be obtained by dividing above mentioned numbers by AES
|
||||
# instructions' interleave factor. Westmere can execute at most 3
|
||||
# instructions' interleave factor. Westmere can execute at most 3
|
||||
# instructions at a time, meaning that optimal interleave factor is 3,
|
||||
# and that's where the "magic" number of 1.25 come from. "Optimal
|
||||
# interleave factor" means that increase of interleave factor does
|
||||
@ -299,7 +299,7 @@ ___
|
||||
# on 2x subroutine on Atom Silvermont account. For processors that
|
||||
# can schedule aes[enc|dec] every cycle optimal interleave factor
|
||||
# equals to corresponding instructions latency. 8x is optimal for
|
||||
# * Bridge and "super-optimal" for other Intel CPUs...
|
||||
# * Bridge and "super-optimal" for other Intel CPUs...
|
||||
|
||||
sub aesni_generate2 {
|
||||
my $dir=shift;
|
||||
@ -1258,7 +1258,7 @@ $code.=<<___;
|
||||
lea 7($ctr),%r9
|
||||
mov %r10d,0x60+12(%rsp)
|
||||
bswap %r9d
|
||||
mov OPENSSL_ia32cap_P+4(%rip),%r10d
|
||||
mov OPENSSL_ia32cap_P+4(%rip),%r10d
|
||||
xor $key0,%r9d
|
||||
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
|
||||
mov %r9d,0x70+12(%rsp)
|
||||
@ -1538,7 +1538,7 @@ $code.=<<___;
|
||||
|
||||
.Lctr32_tail:
|
||||
# note that at this point $inout0..5 are populated with
|
||||
# counter values xor-ed with 0-round key
|
||||
# counter values xor-ed with 0-round key
|
||||
lea 16($key),$key
|
||||
cmp \$4,$len
|
||||
jb .Lctr32_loop3
|
||||
|
||||
@ -957,21 +957,21 @@ if ($flavour =~ /64/) { ######## 64-bit code
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
|
||||
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
|
||||
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||||
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||||
}
|
||||
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
|
||||
sub unvmov32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
|
||||
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||||
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
|
||||
@ -84,7 +84,7 @@ my @s=@_[12..15];
|
||||
|
||||
sub InBasisChange {
|
||||
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
my @b=@_[0..7];
|
||||
$code.=<<___;
|
||||
veor @b[2], @b[2], @b[1]
|
||||
|
||||
@ -122,7 +122,7 @@ my @s=@_[12..15];
|
||||
|
||||
sub InBasisChange {
|
||||
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||||
my @b=@_[0..7];
|
||||
$code.=<<___;
|
||||
pxor @b[6], @b[5]
|
||||
@ -372,7 +372,7 @@ $code.=<<___;
|
||||
pxor @s[0], @t[3]
|
||||
pxor @s[1], @t[2]
|
||||
pxor @s[2], @t[1]
|
||||
pxor @s[3], @t[0]
|
||||
pxor @s[3], @t[0]
|
||||
|
||||
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
|
||||
|
||||
|
||||
@ -438,7 +438,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
##
|
||||
&set_label("schedule_192",16);
|
||||
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&movdqa ("xmm6","xmm0"); # save short part
|
||||
&pxor ("xmm4","xmm4"); # clear 4
|
||||
&movhlps("xmm6","xmm4"); # clobber low side with zeros
|
||||
@ -469,7 +469,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
##
|
||||
&set_label("schedule_256",16);
|
||||
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&call ("_vpaes_schedule_transform"); # input transform
|
||||
&mov ($round,7);
|
||||
|
||||
&set_label("loop_schedule_256");
|
||||
@ -480,7 +480,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
&call ("_vpaes_schedule_round");
|
||||
&dec ($round);
|
||||
&jz (&label("schedule_mangle_last"));
|
||||
&call ("_vpaes_schedule_mangle");
|
||||
&call ("_vpaes_schedule_mangle");
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
&pshufd ("xmm0","xmm0",0xFF);
|
||||
@ -603,7 +603,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
||||
# subbyte
|
||||
&movdqa ("xmm4",&QWP($k_s0F,$const));
|
||||
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
|
||||
&movdqa ("xmm1","xmm4");
|
||||
&movdqa ("xmm1","xmm4");
|
||||
&pandn ("xmm1","xmm0");
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm4"); # 0 = k
|
||||
|
||||
@ -164,7 +164,7 @@ _vpaes_encrypt_core:
|
||||
pshufb %xmm1, %xmm0
|
||||
ret
|
||||
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
|
||||
|
||||
|
||||
##
|
||||
## Decryption core
|
||||
##
|
||||
@ -325,7 +325,7 @@ _vpaes_schedule_core:
|
||||
##
|
||||
.Lschedule_128:
|
||||
mov \$10, %esi
|
||||
|
||||
|
||||
.Loop_schedule_128:
|
||||
call _vpaes_schedule_round
|
||||
dec %rsi
|
||||
@ -359,7 +359,7 @@ _vpaes_schedule_core:
|
||||
|
||||
.Loop_schedule_192:
|
||||
call _vpaes_schedule_round
|
||||
palignr \$8,%xmm6,%xmm0
|
||||
palignr \$8,%xmm6,%xmm0
|
||||
call _vpaes_schedule_mangle # save key n
|
||||
call _vpaes_schedule_192_smear
|
||||
call _vpaes_schedule_mangle # save key n+1
|
||||
@ -385,7 +385,7 @@ _vpaes_schedule_core:
|
||||
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
|
||||
call _vpaes_schedule_transform # input transform
|
||||
mov \$7, %esi
|
||||
|
||||
|
||||
.Loop_schedule_256:
|
||||
call _vpaes_schedule_mangle # output low result
|
||||
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
|
||||
@ -394,7 +394,7 @@ _vpaes_schedule_core:
|
||||
call _vpaes_schedule_round
|
||||
dec %rsi
|
||||
jz .Lschedule_mangle_last
|
||||
call _vpaes_schedule_mangle
|
||||
call _vpaes_schedule_mangle
|
||||
|
||||
# low round. swap xmm7 and xmm6
|
||||
pshufd \$0xFF, %xmm0, %xmm0
|
||||
@ -402,10 +402,10 @@ _vpaes_schedule_core:
|
||||
movdqa %xmm6, %xmm7
|
||||
call _vpaes_schedule_low_round
|
||||
movdqa %xmm5, %xmm7
|
||||
|
||||
|
||||
jmp .Loop_schedule_256
|
||||
|
||||
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle_last
|
||||
##
|
||||
@ -504,9 +504,9 @@ _vpaes_schedule_round:
|
||||
# rotate
|
||||
pshufd \$0xFF, %xmm0, %xmm0
|
||||
palignr \$1, %xmm0, %xmm0
|
||||
|
||||
|
||||
# fall through...
|
||||
|
||||
|
||||
# low round: same as high round, but no rotation and no rcon.
|
||||
_vpaes_schedule_low_round:
|
||||
# smear xmm7
|
||||
@ -545,7 +545,7 @@ _vpaes_schedule_low_round:
|
||||
pxor %xmm4, %xmm0 # 0 = sbox output
|
||||
|
||||
# add in smeared stuff
|
||||
pxor %xmm7, %xmm0
|
||||
pxor %xmm7, %xmm0
|
||||
movdqa %xmm0, %xmm7
|
||||
ret
|
||||
.size _vpaes_schedule_round,.-_vpaes_schedule_round
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
# [depending on key length, less for longer keys] on ARM920T, and
|
||||
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
|
||||
# base and compiler generated code with in-lined umull and even umlal
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# "advantage" of utilizing some "secret" instruction.
|
||||
#
|
||||
# The code is interoperable with Thumb ISA and is rather compact, less
|
||||
|
||||
@ -47,7 +47,7 @@ sub bn_mul_add_words
|
||||
&movd("mm0",&wparam(3)); # mm0 = w
|
||||
&pxor("mm1","mm1"); # mm1 = carry_in
|
||||
&jmp(&label("maw_sse2_entry"));
|
||||
|
||||
|
||||
&set_label("maw_sse2_unrolled",16);
|
||||
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
|
||||
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
|
||||
@ -668,20 +668,20 @@ sub bn_sub_part_words
|
||||
&adc($c,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
}
|
||||
|
||||
|
||||
&comment("");
|
||||
&add($b,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("pw_neg_loop"));
|
||||
|
||||
|
||||
&set_label("pw_neg_finish",0);
|
||||
&mov($tmp2,&wparam(4)); # get dl
|
||||
&mov($num,0);
|
||||
&sub($num,$tmp2);
|
||||
&and($num,7);
|
||||
&jz(&label("pw_end"));
|
||||
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("dl<0 Tail Round $i");
|
||||
@ -698,9 +698,9 @@ sub bn_sub_part_words
|
||||
}
|
||||
|
||||
&jmp(&label("pw_end"));
|
||||
|
||||
|
||||
&set_label("pw_pos",0);
|
||||
|
||||
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
&jz(&label("pw_pos_finish"));
|
||||
|
||||
@ -715,18 +715,18 @@ sub bn_sub_part_words
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
&jnc(&label("pw_nc".$i));
|
||||
}
|
||||
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("pw_pos_loop"));
|
||||
|
||||
|
||||
&set_label("pw_pos_finish",0);
|
||||
&mov($num,&wparam(4)); # get dl
|
||||
&and($num,7);
|
||||
&jz(&label("pw_end"));
|
||||
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("dl>0 Tail Round $i");
|
||||
@ -747,17 +747,17 @@ sub bn_sub_part_words
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
&set_label("pw_nc".$i,0);
|
||||
}
|
||||
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("pw_nc_loop"));
|
||||
|
||||
|
||||
&mov($num,&wparam(4)); # get dl
|
||||
&and($num,7);
|
||||
&jz(&label("pw_nc_end"));
|
||||
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
|
||||
@ -41,7 +41,7 @@ sub mul_add_c
|
||||
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
|
||||
}
|
||||
@ -70,7 +70,7 @@ sub sqr_add_c
|
||||
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
|
||||
}
|
||||
@ -121,7 +121,7 @@ sub bn_mul_comba
|
||||
$c2="ebp";
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
|
||||
|
||||
$as=0;
|
||||
$ae=0;
|
||||
$bs=0;
|
||||
@ -136,9 +136,9 @@ sub bn_mul_comba
|
||||
&push("ebx");
|
||||
|
||||
&xor($c0,$c0);
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
&xor($c1,$c1);
|
||||
&mov("edx",&DWP(0,$b,"",0)); # load the first second
|
||||
&mov("edx",&DWP(0,$b,"",0)); # load the first second
|
||||
|
||||
for ($i=0; $i<$tot; $i++)
|
||||
{
|
||||
@ -146,7 +146,7 @@ sub bn_mul_comba
|
||||
$bi=$bs;
|
||||
$end=$be+1;
|
||||
|
||||
&comment("################## Calculate word $i");
|
||||
&comment("################## Calculate word $i");
|
||||
|
||||
for ($j=$bs; $j<$end; $j++)
|
||||
{
|
||||
|
||||
@ -359,7 +359,7 @@ $code.=<<___;
|
||||
vpaddq $TEMP1, $ACC1, $ACC1
|
||||
vpmuludq 32*7-128($aap), $B2, $ACC2
|
||||
vpbroadcastq 32*5-128($tpa), $B2
|
||||
vpaddq 32*11-448($tp1), $ACC2, $ACC2
|
||||
vpaddq 32*11-448($tp1), $ACC2, $ACC2
|
||||
|
||||
vmovdqu $ACC6, 32*6-192($tp0)
|
||||
vmovdqu $ACC7, 32*7-192($tp0)
|
||||
@ -418,7 +418,7 @@ $code.=<<___;
|
||||
vmovdqu $ACC7, 32*16-448($tp1)
|
||||
lea 8($tp1), $tp1
|
||||
|
||||
dec $i
|
||||
dec $i
|
||||
jnz .LOOP_SQR_1024
|
||||
___
|
||||
$ZERO = $ACC9;
|
||||
@ -763,7 +763,7 @@ $code.=<<___;
|
||||
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
|
||||
vpaddq $TEMP3, $ACC7, $ACC7
|
||||
vpaddq $TEMP4, $ACC8, $ACC8
|
||||
|
||||
|
||||
vpsrlq \$29, $ACC4, $TEMP1
|
||||
vpand $AND_MASK, $ACC4, $ACC4
|
||||
vpsrlq \$29, $ACC5, $TEMP2
|
||||
@ -1429,7 +1429,7 @@ $code.=<<___;
|
||||
vpaddq $TEMP4, $ACC8, $ACC8
|
||||
|
||||
vmovdqu $ACC4, 128-128($rp)
|
||||
vmovdqu $ACC5, 160-128($rp)
|
||||
vmovdqu $ACC5, 160-128($rp)
|
||||
vmovdqu $ACC6, 192-128($rp)
|
||||
vmovdqu $ACC7, 224-128($rp)
|
||||
vmovdqu $ACC8, 256-128($rp)
|
||||
|
||||
@ -32,7 +32,7 @@ require "x86asm.pl";
|
||||
|
||||
$output = pop;
|
||||
open STDOUT,">$output";
|
||||
|
||||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
$sse2=0;
|
||||
|
||||
@ -1010,7 +1010,7 @@ my $bptr="%rdx"; # const void *table,
|
||||
my $nptr="%rcx"; # const BN_ULONG *nptr,
|
||||
my $n0 ="%r8"; # const BN_ULONG *n0);
|
||||
my $num ="%r9"; # int num, has to be divisible by 8
|
||||
# int pwr
|
||||
# int pwr
|
||||
|
||||
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
|
||||
my @A0=("%r10","%r11");
|
||||
@ -1078,7 +1078,7 @@ $code.=<<___;
|
||||
.byte 0x2e # predict non-taken
|
||||
jnc .Lpwr_page_walk
|
||||
|
||||
mov $num,%r10
|
||||
mov $num,%r10
|
||||
neg $num
|
||||
|
||||
##############################################################
|
||||
@ -1987,7 +1987,7 @@ __bn_post4x_internal:
|
||||
jnz .Lsqr4x_sub
|
||||
|
||||
mov $num,%r10 # prepare for back-to-back call
|
||||
neg $num # restore $num
|
||||
neg $num # restore $num
|
||||
ret
|
||||
.size __bn_post4x_internal,.-__bn_post4x_internal
|
||||
___
|
||||
@ -2199,7 +2199,7 @@ bn_mulx4x_mont_gather5:
|
||||
mov \$0,%r10
|
||||
cmovc %r10,%r11
|
||||
sub %r11,%rsp
|
||||
.Lmulx4xsp_done:
|
||||
.Lmulx4xsp_done:
|
||||
and \$-64,%rsp # ensure alignment
|
||||
mov %rax,%r11
|
||||
sub %rsp,%r11
|
||||
@ -2665,7 +2665,7 @@ bn_powerx5:
|
||||
.byte 0x2e # predict non-taken
|
||||
jnc .Lpwrx_page_walk
|
||||
|
||||
mov $num,%r10
|
||||
mov $num,%r10
|
||||
neg $num
|
||||
|
||||
##############################################################
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
# ====================================================================
|
||||
#
|
||||
# December 2014
|
||||
#
|
||||
#
|
||||
# ChaCha20 for ARMv4.
|
||||
#
|
||||
# Performance in cycles per byte out of large buffer.
|
||||
@ -713,7 +713,7 @@ ChaCha20_neon:
|
||||
vadd.i32 $d2,$d1,$t0 @ counter+2
|
||||
str @t[3], [sp,#4*(16+15)]
|
||||
mov @t[3],#10
|
||||
add @x[12],@x[12],#3 @ counter+3
|
||||
add @x[12],@x[12],#3 @ counter+3
|
||||
b .Loop_neon
|
||||
|
||||
.align 4
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
# ====================================================================
|
||||
#
|
||||
# June 2015
|
||||
#
|
||||
#
|
||||
# ChaCha20 for ARMv8.
|
||||
#
|
||||
# Performance in cycles per byte out of large buffer.
|
||||
@ -193,7 +193,7 @@ ChaCha20_ctr32:
|
||||
mov $ctr,#10
|
||||
subs $len,$len,#64
|
||||
.Loop:
|
||||
sub $ctr,$ctr,#1
|
||||
sub $ctr,$ctr,#1
|
||||
___
|
||||
foreach (&ROUND(0, 4, 8,12)) { eval; }
|
||||
foreach (&ROUND(0, 5,10,15)) { eval; }
|
||||
|
||||
@ -289,7 +289,7 @@ __ecp_nistz256_mul_montq:
|
||||
adc \$0, $acc0
|
||||
|
||||
########################################################################
|
||||
# Second reduction step
|
||||
# Second reduction step
|
||||
mov $acc1, $t1
|
||||
shl \$32, $acc1
|
||||
mulq $poly3
|
||||
@ -336,7 +336,7 @@ __ecp_nistz256_mul_montq:
|
||||
adc \$0, $acc1
|
||||
|
||||
########################################################################
|
||||
# Third reduction step
|
||||
# Third reduction step
|
||||
mov $acc2, $t1
|
||||
shl \$32, $acc2
|
||||
mulq $poly3
|
||||
@ -383,7 +383,7 @@ __ecp_nistz256_mul_montq:
|
||||
adc \$0, $acc2
|
||||
|
||||
########################################################################
|
||||
# Final reduction step
|
||||
# Final reduction step
|
||||
mov $acc3, $t1
|
||||
shl \$32, $acc3
|
||||
mulq $poly3
|
||||
@ -396,7 +396,7 @@ __ecp_nistz256_mul_montq:
|
||||
mov $acc5, $t1
|
||||
adc \$0, $acc2
|
||||
|
||||
########################################################################
|
||||
########################################################################
|
||||
# Branch-less conditional subtraction of P
|
||||
sub \$-1, $acc4 # .Lpoly[0]
|
||||
mov $acc0, $t2
|
||||
@ -1649,7 +1649,7 @@ $code.=<<___;
|
||||
movq %xmm1, $r_ptr
|
||||
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
|
||||
___
|
||||
{
|
||||
{
|
||||
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
|
||||
# operate in 4-5-6-7 "name space" that matches squaring output
|
||||
#
|
||||
@ -1738,7 +1738,7 @@ $code.=<<___;
|
||||
lea $M(%rsp), $b_ptr
|
||||
mov $acc4, $acc6 # harmonize sub output and mul input
|
||||
xor %ecx, %ecx
|
||||
mov $acc4, $S+8*0(%rsp) # have to save:-(
|
||||
mov $acc4, $S+8*0(%rsp) # have to save:-(
|
||||
mov $acc5, $acc2
|
||||
mov $acc5, $S+8*1(%rsp)
|
||||
cmovz $acc0, $acc3
|
||||
|
||||
@ -50,7 +50,7 @@ sub R0
|
||||
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
|
||||
|
||||
&mov($tmp1,$C) if $pos < 0;
|
||||
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
|
||||
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
|
||||
|
||||
# body proper
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@
|
||||
#
|
||||
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||||
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||||
#
|
||||
#
|
||||
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||||
|
||||
# ====================================================================
|
||||
@ -486,7 +486,7 @@ $code.=<<___;
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
sub $Xi,#16
|
||||
sub $Xi,#16
|
||||
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
|
||||
vst1.64 $Xl#lo,[$Xi]
|
||||
|
||||
|
||||
@ -576,7 +576,7 @@ sub mmx_loop() {
|
||||
&bswap ($dat);
|
||||
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
|
||||
&bswap ("ebx");
|
||||
|
||||
|
||||
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
|
||||
&jne (&label("outer"));
|
||||
}
|
||||
@ -680,7 +680,7 @@ my ($Xhi,$Xi) = @_;
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T1,8); #
|
||||
&psrldq ($T1,8); #
|
||||
&pxor ($Xi,$T2);
|
||||
&pxor ($Xhi,$T1); #
|
||||
|
||||
@ -850,7 +850,7 @@ my ($Xhi,$Xi) = @_;
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T1,8); #
|
||||
&psrldq ($T1,8); #
|
||||
&pxor ($Xi,$T2);
|
||||
&pxor ($Xhi,$T1); #
|
||||
&pshufd ($T1,$Xhn,0b01001110);
|
||||
|
||||
@ -449,7 +449,7 @@ $code.=<<___;
|
||||
psllq \$57,$Xi #
|
||||
movdqa $Xi,$T1 #
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T1 #
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pxor $T1,$Xhi #
|
||||
|
||||
@ -563,7 +563,7 @@ ___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
|
||||
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
|
||||
# experimental alternative. special thing about is that there
|
||||
# no dependency between the two multiplications...
|
||||
# no dependency between the two multiplications...
|
||||
mov \$`0xE1<<1`,%eax
|
||||
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
|
||||
mov \$0x07,%r11d
|
||||
@ -738,7 +738,7 @@ $code.=<<___;
|
||||
movdqa $T2,$T1 #
|
||||
pslldq \$8,$T2
|
||||
pclmulqdq \$0x00,$Hkey2,$Xln
|
||||
psrldq \$8,$T1 #
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pxor $T1,$Xhi #
|
||||
movdqu 0($inp),$T1
|
||||
@ -874,7 +874,7 @@ $code.=<<___;
|
||||
psllq \$57,$Xi #
|
||||
movdqa $Xi,$T1 #
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T1 #
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pshufd \$0b01001110,$Xhn,$Xmn
|
||||
pxor $T1,$Xhi #
|
||||
|
||||
@ -36,7 +36,7 @@ my $globl = sub {
|
||||
my $ret;
|
||||
|
||||
$name =~ s|^\.||;
|
||||
|
||||
|
||||
SWITCH: for ($flavour) {
|
||||
/aix/ && do { if (!$$type) {
|
||||
$$type = "\@function";
|
||||
|
||||
@ -140,7 +140,7 @@ ___
|
||||
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
|
||||
push (@out,$comm)
|
||||
}
|
||||
push (@out,$initseg) if ($initseg);
|
||||
push (@out,$initseg) if ($initseg);
|
||||
}
|
||||
|
||||
sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }
|
||||
|
||||
@ -246,7 +246,7 @@ sha1_block_data_order:
|
||||
jz .Lialu
|
||||
___
|
||||
$code.=<<___ if ($shaext);
|
||||
test \$`1<<29`,%r10d # check SHA bit
|
||||
test \$`1<<29`,%r10d # check SHA bit
|
||||
jnz _shaext_shortcut
|
||||
___
|
||||
$code.=<<___ if ($avx>1);
|
||||
|
||||
@ -40,7 +40,7 @@
|
||||
#
|
||||
# Performance in clock cycles per processed byte (less is better):
|
||||
#
|
||||
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
|
||||
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
|
||||
# Pentium 46 57 40/38 - -
|
||||
# PIII 36 33 27/24 - -
|
||||
# P4 41 38 28 - 17.3
|
||||
@ -263,7 +263,7 @@ my $suffix=shift;
|
||||
&mov ($Coff,"ecx");
|
||||
&mov ($Doff,"edi");
|
||||
&mov (&DWP(0,"esp"),"ebx"); # magic
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ("ebx",&DWP(20,"esi"));
|
||||
&mov ("ecx",&DWP(24,"esi"));
|
||||
&mov ("edi",&DWP(28,"esi"));
|
||||
@ -372,7 +372,7 @@ my @AH=($A,$K256);
|
||||
&xor ($AH[1],"ecx"); # magic
|
||||
&mov (&DWP(8,"esp"),"ecx");
|
||||
&mov (&DWP(12,"esp"),"ebx");
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ($E,&DWP(16,"esi"));
|
||||
&mov ("ebx",&DWP(20,"esi"));
|
||||
&mov ("ecx",&DWP(24,"esi"));
|
||||
&mov ("esi",&DWP(28,"esi"));
|
||||
|
||||
@ -376,7 +376,7 @@ if ($sse2) {
|
||||
|
||||
&set_label("16_79_sse2",16);
|
||||
for ($j=0;$j<2;$j++) { # 2x unroll
|
||||
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
|
||||
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
|
||||
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
|
||||
&movq ("mm1","mm7");
|
||||
&psrlq ("mm7",1);
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
|
||||
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
|
||||
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
|
||||
#
|
||||
#
|
||||
# (*) Software SHA256 results are of lesser relevance, presented
|
||||
# mostly for informational purposes.
|
||||
# (**) The result is a trade-off: it's possible to improve it by
|
||||
|
||||
@ -1766,7 +1766,7 @@ if ($avx>1) {{
|
||||
######################################################################
|
||||
# AVX2+BMI code path
|
||||
#
|
||||
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
|
||||
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
|
||||
my $PUSH8=8*2*$SZ;
|
||||
use integer;
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user