#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.

# Locate the output file name on the command line.  Arguments are consumed
# from the left until one ends in a "name.ext" component; anything in front
# of it is skipped.  If no argument matches, $output ends up false.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Send the generated assembly to that file.  When no file name was found the
# script keeps writing to the STDOUT it was started with.  Three-argument
# open keeps characters in the file name from being read as an open mode,
# and a failure to create the file is fatal instead of silently ignored.
if (defined($output) && length($output)) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register allocation.  Each variable holds the textual name of a register
# that is interpolated into the assembly templates below.
($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");  # argument vector

# Low and high halfword of the current multiplicand, each followed by the
# registers receiving its products with bytes 0..3 of the multiplier.
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map { "A$_" } (16..20);
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map { "B$_" } (16..20);
# Bytes 0..3 of the current multiplier.
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
# Staging registers for the operands of the 2nd and 3rd multiplication;
# note that they alias $Alo and $B_1.
($A,$B)=($Alo,$B_1);
# Register loaded with the 0xFF byte mask at function entry.
$xFF="B1";

# mul_1x1_upper($A,$B)
#
# Appends to $code the first half of a 1x1-word polynomial multiplication:
# $B is split into four bytes ($B_0..$B_3), $A into two halfwords
# ($Alo,$Ahi), and all eight 16x8-bit XORMPY products are issued into
# $Alox0..3 and $Ahix0..3.  The products are NOT combined here; that is
# done by the code emitted from mul_1x1_merged or mul_1x1_lower.
#
#   $A, $B - names of the registers holding the two operands
#
# Layout of the emitted text matters: mnemonics are indented so that they
# are not taken for labels, while the "||" parallel markers start in the
# first column.
sub mul_1x1_upper {
    my ($A,$B)=@_;
    $code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16, $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# mul_1x1_merged($OUTlo,$OUThi,$A,$B)
#
# Appends to $code a sequence that interleaves two jobs:
#   * folding the eight partial products left in $Alox0..3/$Ahix0..3 by
#     the previous multiplication into the 64-bit result $OUThi:$OUTlo
#     (shift each product to its position, XOR everything together);
#   * starting the next multiplication $A x $B: splitting the operands
#     and issuing the eight XORMPY products, exactly as mul_1x1_upper does.
#
#   $OUTlo, $OUThi - registers receiving the previous product
#   $A, $B         - registers holding the operands of the next product
#
# The new $Alo x $B_0 product is written to A1 and only moved into $Alox0
# by the last instruction, because the previous value of $Alox0 is still
# read by the "XOR $Alox0,$OUTlo,$OUTlo" emitted in between.
#
# Layout of the emitted text matters: mnemonics are indented so that they
# are not taken for labels, while the "||" parallel markers start in the
# first column.
sub mul_1x1_merged {
    my ($OUTlo,$OUThi,$A,$B)=@_;
    $code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16, $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
___
}
# mul_1x1_lower($OUTlo,$OUThi)
#
# Appends to $code the closing half of a 1x1-word polynomial
# multiplication: the eight partial products left in $Alox0..3/$Ahix0..3
# are shifted to their positions and XORed together into the 64-bit result
# $OUThi:$OUTlo.  This is the combining sequence of mul_1x1_merged without
# a following multiplication interleaved; a NOP stands in the slot that the
# merged variant fills with multiply instructions.
#
#   $OUTlo, $OUThi - registers receiving the product
#
# Layout of the emitted text matters: mnemonics are indented so that they
# are not taken for labels, while ";" comments and the "||" parallel
# markers start in the first column.
sub mul_1x1_lower {
    my ($OUTlo,$OUThi)=@_;
    $code.=<<___;
;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# Body of bn_GF2m_mul_2x2.  The 2x2-word product is assembled from three
# 1x1-word products (Karatsuba-style): a0·b0, a1·b1 and (a0+a1)·(b0+b1),
# where "+" is XOR.  The combining tail of each product is overlapped with
# the head of the next one by mul_1x1_merged.
#
# In the emitted text the function label starts in the first column, while
# directives and mnemonics are indented so they are not taken for labels.

# Section/ABI preamble, entry label and the 0xFF byte mask.  When the
# assembler reports __TI_EABI__ the unprefixed symbol name is used.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
	mul_1x1_upper($a0,$b0);		# a0·b0
# Stage the operands of the second product, a1 and b1.
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
	mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
# Stage the operands of the third product, a0+a1 and b0+b1.
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
	mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
	mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
# Issue the branch through B3, fold the middle term into the result and
# store the four result words to rp[0..3].
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

# Write out the accumulated assembly text.
print $code;
# The result of close is checked so that a failed write of the output
# file aborts the script instead of leaving a truncated file unnoticed.
close STDOUT or die "error closing STDOUT: $!";