Remove trailing whitespace from Perl files.

Upstream did this in 609b0852e4d50251857dbbac3141ba042e35a9ae and it's
easier to apply patches if we do also.

Change-Id: I5142693ed1e26640987ff16f5ea510e81bba200e
Reviewed-on: https://boringssl-review.googlesource.com/13771
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
This commit is contained in:
Adam Langley 2017-02-09 12:21:08 -08:00 committed by David Benjamin
parent 073a06d3da
commit c948d46569
29 changed files with 92 additions and 92 deletions

View File

@ -116,7 +116,7 @@
# words every cache-line is *guaranteed* to be accessed within ~50
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
# part is that it has no negative effect on performance:-)
# part is that it has no negative effect on performance:-)
#
# Version 4.3 implements switch between compact and non-compact block
# functions in AES_cbc_encrypt depending on how much data was asked
@ -578,7 +578,7 @@ sub enctransform()
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | mm4 | mm0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | s3 | s2 | s1 | s0 |
# | s3 | s2 | s1 | s0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
@ -798,7 +798,7 @@ sub encstep()
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
&xor ($out,&DWP(1,$te,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
@ -1551,7 +1551,7 @@ sub sse_deccompact()
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
@ -2021,7 +2021,7 @@ sub declast()
{
# stack frame layout
# -4(%esp) # return address 0(%esp)
# 0(%esp) # s0 backing store 4(%esp)
# 0(%esp) # s0 backing store 4(%esp)
# 4(%esp) # s1 backing store 8(%esp)
# 8(%esp) # s2 backing store 12(%esp)
# 12(%esp) # s3 backing store 16(%esp)
@ -2731,7 +2731,7 @@ sub enckey()
&mov (&DWP(80,"edi"),10); # setup number of rounds
&xor ("eax","eax");
&jmp (&label("exit"));
&set_label("12rounds");
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
&mov ("ebx",&DWP(4,"esi"));

View File

@ -1286,7 +1286,7 @@ $code.=<<___;
asm_AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
push %r12 # redundant, but allows to share
push %r13 # exception handler...
push %r14
push %r15
@ -1412,7 +1412,7 @@ $code.=<<___;
xor %rax,%rax
jmp .Lexit
.L14rounds:
.L14rounds:
mov 0(%rsi),%rax # copy first 8 dwords
mov 8(%rsi),%rbx
mov 16(%rsi),%rcx

View File

@ -1040,7 +1040,7 @@ if ($PREFIX eq "aesni") {
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }

View File

@ -27,7 +27,7 @@
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
@ -111,7 +111,7 @@
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
@ -299,7 +299,7 @@ ___
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...
# * Bridge and "super-optimal" for other Intel CPUs...
sub aesni_generate2 {
my $dir=shift;
@ -1258,7 +1258,7 @@ $code.=<<___;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
mov OPENSSL_ia32cap_P+4(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
@ -1538,7 +1538,7 @@ $code.=<<___;
.Lctr32_tail:
# note that at this point $inout0..5 are populated with
# counter values xor-ed with 0-round key
# counter values xor-ed with 0-round key
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3

View File

@ -957,21 +957,21 @@ if ($flavour =~ /64/) { ######## 64-bit code
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {

View File

@ -84,7 +84,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]

View File

@ -122,7 +122,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
@ -372,7 +372,7 @@ $code.=<<___;
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
pxor @s[3], @t[0]
pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

View File

@ -438,7 +438,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
@ -469,7 +469,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
@ -480,7 +480,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
@ -603,7 +603,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k

View File

@ -164,7 +164,7 @@ _vpaes_encrypt_core:
pshufb %xmm1, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
##
## Decryption core
##
@ -325,7 +325,7 @@ _vpaes_schedule_core:
##
.Lschedule_128:
mov \$10, %esi
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
@ -359,7 +359,7 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
palignr \$8,%xmm6,%xmm0
palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
@ -385,7 +385,7 @@ _vpaes_schedule_core:
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
@ -394,7 +394,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
call _vpaes_schedule_mangle
call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
@ -402,10 +402,10 @@ _vpaes_schedule_core:
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
jmp .Loop_schedule_256
##
## .aes_schedule_mangle_last
##
@ -504,9 +504,9 @@ _vpaes_schedule_round:
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
# fall through...
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
@ -545,7 +545,7 @@ _vpaes_schedule_low_round:
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
pxor %xmm7, %xmm0
pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

View File

@ -16,7 +16,7 @@
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less

View File

@ -47,7 +47,7 @@ sub bn_mul_add_words
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
@ -668,20 +668,20 @@ sub bn_sub_part_words
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
@ -698,9 +698,9 @@ sub bn_sub_part_words
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
@ -715,18 +715,18 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
@ -747,17 +747,17 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a

View File

@ -41,7 +41,7 @@ sub mul_add_c
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
@ -70,7 +70,7 @@ sub sqr_add_c
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
@ -121,7 +121,7 @@ sub bn_mul_comba
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
@ -136,9 +136,9 @@ sub bn_mul_comba
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
@ -146,7 +146,7 @@ sub bn_mul_comba
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{

View File

@ -359,7 +359,7 @@ $code.=<<___;
vpaddq $TEMP1, $ACC1, $ACC1
vpmuludq 32*7-128($aap), $B2, $ACC2
vpbroadcastq 32*5-128($tpa), $B2
vpaddq 32*11-448($tp1), $ACC2, $ACC2
vpaddq 32*11-448($tp1), $ACC2, $ACC2
vmovdqu $ACC6, 32*6-192($tp0)
vmovdqu $ACC7, 32*7-192($tp0)
@ -418,7 +418,7 @@ $code.=<<___;
vmovdqu $ACC7, 32*16-448($tp1)
lea 8($tp1), $tp1
dec $i
dec $i
jnz .LOOP_SQR_1024
___
$ZERO = $ACC9;
@ -763,7 +763,7 @@ $code.=<<___;
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
@ -1429,7 +1429,7 @@ $code.=<<___;
vpaddq $TEMP4, $ACC8, $ACC8
vmovdqu $ACC4, 128-128($rp)
vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC6, 192-128($rp)
vmovdqu $ACC7, 224-128($rp)
vmovdqu $ACC8, 256-128($rp)

View File

@ -32,7 +32,7 @@ require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$0);
$sse2=0;

View File

@ -1010,7 +1010,7 @@ my $bptr="%rdx"; # const void *table,
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
# int pwr
# int pwr
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
@ -1078,7 +1078,7 @@ $code.=<<___;
.byte 0x2e # predict non-taken
jnc .Lpwr_page_walk
mov $num,%r10
mov $num,%r10
neg $num
##############################################################
@ -1987,7 +1987,7 @@ __bn_post4x_internal:
jnz .Lsqr4x_sub
mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num
neg $num # restore $num
ret
.size __bn_post4x_internal,.-__bn_post4x_internal
___
@ -2199,7 +2199,7 @@ bn_mulx4x_mont_gather5:
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
.Lmulx4xsp_done:
.Lmulx4xsp_done:
and \$-64,%rsp # ensure alignment
mov %rax,%r11
sub %rsp,%r11
@ -2665,7 +2665,7 @@ bn_powerx5:
.byte 0x2e # predict non-taken
jnc .Lpwrx_page_walk
mov $num,%r10
mov $num,%r10
neg $num
##############################################################

View File

@ -8,7 +8,7 @@
# ====================================================================
#
# December 2014
#
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
@ -713,7 +713,7 @@ ChaCha20_neon:
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
add @x[12],@x[12],#3 @ counter+3
add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4

View File

@ -8,7 +8,7 @@
# ====================================================================
#
# June 2015
#
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
@ -193,7 +193,7 @@ ChaCha20_ctr32:
mov $ctr,#10
subs $len,$len,#64
.Loop:
sub $ctr,$ctr,#1
sub $ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }

View File

@ -289,7 +289,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc0
########################################################################
# Second reduction step
# Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
@ -336,7 +336,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc1
########################################################################
# Third reduction step
# Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
@ -383,7 +383,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc2
########################################################################
# Final reduction step
# Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
@ -396,7 +396,7 @@ __ecp_nistz256_mul_montq:
mov $acc5, $t1
adc \$0, $acc2
########################################################################
########################################################################
# Branch-less conditional subtraction of P
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
@ -1649,7 +1649,7 @@ $code.=<<___;
movq %xmm1, $r_ptr
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
___
{
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
@ -1738,7 +1738,7 @@ $code.=<<___;
lea $M(%rsp), $b_ptr
mov $acc4, $acc6 # harmonize sub output and mul input
xor %ecx, %ecx
mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc5, $acc2
mov $acc5, $S+8*1(%rsp)
cmovz $acc0, $acc3

View File

@ -50,7 +50,7 @@ sub R0
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&mov($tmp1,$C) if $pos < 0;
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
# body proper

View File

@ -47,7 +47,7 @@
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
@ -486,7 +486,7 @@ $code.=<<___;
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]

View File

@ -576,7 +576,7 @@ sub mmx_loop() {
&bswap ($dat);
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
&bswap ("ebx");
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
&jne (&label("outer"));
}
@ -680,7 +680,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
@ -850,7 +850,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);

View File

@ -449,7 +449,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
psrldq \$8,$T1 #
psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
@ -563,7 +563,7 @@ ___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
# no dependency between the two multiplications...
# no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
@ -738,7 +738,7 @@ $code.=<<___;
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
psrldq \$8,$T1 #
psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
@ -874,7 +874,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
psrldq \$8,$T1 #
psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #

View File

@ -36,7 +36,7 @@ my $globl = sub {
my $ret;
$name =~ s|^\.||;
SWITCH: for ($flavour) {
/aix/ && do { if (!$$type) {
$$type = "\@function";

View File

@ -140,7 +140,7 @@ ___
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
push (@out,$comm)
}
push (@out,$initseg) if ($initseg);
push (@out,$initseg) if ($initseg);
}
sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }

View File

@ -246,7 +246,7 @@ sha1_block_data_order:
jz .Lialu
___
$code.=<<___ if ($shaext);
test \$`1<<29`,%r10d # check SHA bit
test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);

View File

@ -40,7 +40,7 @@
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# Pentium 46 57 40/38 - -
# PIII 36 33 27/24 - -
# P4 41 38 28 - 17.3
@ -263,7 +263,7 @@ my $suffix=shift;
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
&mov (&DWP(0,"esp"),"ebx"); # magic
&mov ($E,&DWP(16,"esi"));
&mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("edi",&DWP(28,"esi"));
@ -372,7 +372,7 @@ my @AH=($A,$K256);
&xor ($AH[1],"ecx"); # magic
&mov (&DWP(8,"esp"),"ecx");
&mov (&DWP(12,"esp"),"ebx");
&mov ($E,&DWP(16,"esi"));
&mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("esi",&DWP(28,"esi"));

View File

@ -376,7 +376,7 @@ if ($sse2) {
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);

View File

@ -18,7 +18,7 @@
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
#
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by

View File

@ -1766,7 +1766,7 @@ if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;