openssl/crypto/bn/asm/bn-c64xplus.asm

;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the OpenSSL license (the "License").  You may not use
;; this file except in compliance with the License.  You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
	.text

;; For assembler versions below 7000000, __TI_EABI__ is given the value
;; 0 here so that the .if test further down can be evaluated.
	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
;; When __TI_EABI__ is non-zero, every underscore-prefixed routine name
;; used below is substituted by its unprefixed spelling, so the labels
;; and .global directives emit bn_xxx instead of _bn_xxx.
	.if	__TI_EABI__
	.asg	bn_mul_add_words,_bn_mul_add_words
	.asg	bn_mul_words,_bn_mul_words
	.asg	bn_sqr_words,_bn_sqr_words
	.asg	bn_add_words,_bn_add_words
	.asg	bn_sub_words,_bn_sub_words
	.asg	bn_div_words,_bn_div_words
	.asg	bn_sqr_comba8,_bn_sqr_comba8
	.asg	bn_mul_comba8,_bn_mul_comba8
	.asg	bn_sqr_comba4,_bn_sqr_comba4
	.asg	bn_mul_comba4,_bn_mul_comba4
	.endif

;; Symbolic register names used by the routines below:
;;   RA         B3, the register every routine branches to on return
;;   ARG0..ARG5 A4, B4, A6, B6, A8, B8 - incoming arguments in order
;;   RET        A4 - return value; note it is the same register as ARG0,
;;              which is why several routines copy ARG0 before writing
;;              RET
;;   FP, DP, SP A15, B14, B15 - defined here, not referenced by the
;;              routines in this file
	.asg	B3,RA
	.asg	A4,ARG0
	.asg	B4,ARG1
	.asg	A6,ARG2
	.asg	B6,ARG3
	.asg	A8,ARG4
	.asg	B8,ARG5
	.asg	A4,RET
	.asg	A15,FP
	.asg	B14,DP
	.asg	B15,SP

	.global	_bn_mul_add_words
;;--------------------------------------------------------------------
;; bn_mul_add_words: for each of n words, rp[i] += ap[i]*w, with the
;; carry chained from one word to the next.
;;   ARG0 (A4)	rp, read and written back
;;   ARG1 (B4)	ap
;;   ARG2 (A6)	n, number of 32-bit words (becomes the SPLOOP count)
;;   ARG3 (B6)	w, the 32-bit multiplier
;;   RET  (A4)	high part of the accumulator after the last word,
;;		or 0 when n is 0
;;--------------------------------------------------------------------
_bn_mul_add_words:
	.asmfunc
	MV	ARG2,B0		; B0 = n, also the "non-empty" predicate
  [!B0]	BNOP	RA		; n==0: return early with RET=0; the two
||[!B0]	MVK	0,RET		; packets below plus NOP 3 are its delay slots
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A19		; high part of accumulator
|| [B0]	MV	ARG0,A2		; A2 = rp pointer used for stores
|| [B0]	MV	ARG3,A3		; A3 = w, multiplier operand of MPY32U
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	3
	LDW	*ARG0++,A7	; rp[i]
	MPY32U	B7,A3,A17:A16	; A17:A16 = ap[i]*w
	NOP	3		; [2,0] in epilogue
	ADDU	A16,A7,A21:A20	; lo(product) + rp[i]
	ADDU	A19,A21:A20,A19:A18	; + incoming carry; A18 = result word
||	MV.S	A17,A23		; hi(product), consumed by the ADD below
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*A2++	; rp[i]
||	ADD	A19,A23,A19	; outgoing carry = carry bits + hi(product)
;;====================================================================
	BNOP	RA,4		; return; MV below sits in the 5th delay slot
	MV	A19,RET		; return value
	.endasmfunc

	.global	_bn_mul_words
;;--------------------------------------------------------------------
;; bn_mul_words: for each of n words, rp[i] = low word of
;; (ap[i]*w + carry), with the carry chained from word to word.
;;   ARG0 (A4)	rp, written only
;;   ARG1 (B4)	ap
;;   ARG2 (A6)	n, number of 32-bit words (becomes the SPLOOP count)
;;   ARG3 (B6)	w, the 32-bit multiplier
;;   RET  (A4)	high part of the accumulator after the last word,
;;		or 0 when n is 0
;;--------------------------------------------------------------------
_bn_mul_words:
	.asmfunc
	MV	ARG2,B0		; B0 = n, also the "non-empty" predicate
  [!B0]	BNOP	RA		; n==0: return early with RET=0; the two
||[!B0]	MVK	0,RET		; packets below plus NOP 3 are its delay slots
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A19		; high part of accumulator
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,A7	; ap[i]
	NOP	4
	MPY32U	A7,ARG3,A17:A16	; A17:A16 = ap[i]*w
	NOP	4		; [2,0] in epilogue
	ADDU	A19,A16,A19:A18	; carry + lo(product); A18 = result word
||	MV.S	A17,A21		; hi(product), consumed by the ADD below
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*ARG0++	; rp[i]
||	ADD.L	A19,A21,A19	; outgoing carry = carry bit + hi(product)
;;====================================================================
	BNOP	RA,4		; return; MV below sits in the 5th delay slot
	MV	A19,RET		; return value
	.endasmfunc

	.global	_bn_sqr_words
;;--------------------------------------------------------------------
;; bn_sqr_words: for each of n words, store the 64-bit square of ap[i]
;; as the word pair rp[2*i] (low) and rp[2*i+1] (high).
;;   ARG0 (A4)	rp, 2*n words written
;;   ARG1 (B4)	ap
;;   ARG2 (A6)	n, number of 32-bit words (becomes the SPLOOP count)
;; No result is placed in RET after the loop; only the n==0 early
;; return writes RET (with 0).
;;--------------------------------------------------------------------
_bn_sqr_words:
	.asmfunc
	MV	ARG2,B0		; B0 = n, also the "non-empty" predicate
  [!B0]	BNOP	RA		; n==0: return early; the two packets
||[!B0]	MVK	0,RET		; below plus NOP 3 are its delay slots
   [B0]	MVC	B0,ILC		; loop count; B0 is reused by MPY32U later
   [B0]	MV	ARG0,B2		; B2   -> rp[0], even (low) words
|| [B0]	ADD	4,ARG0,ARG0	; ARG0 -> rp[1], odd (high) words
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	MPY32U	B7,B7,B1:B0	; B1:B0 = ap[i]^2
	NOP	3		; [2,0] in epilogue
	STW	B0,*B2++(8)	; rp[2*i]
	MV	B1,A1		; high word moved to the A side for its store
	SPKERNEL 2,0		; fully overlap BNOP RA,5
||	STW	A1,*ARG0++(8)	; rp[2*i+1]
;;====================================================================
	BNOP	RA,5		; return
	.endasmfunc

	.global	_bn_add_words
;;--------------------------------------------------------------------
;; bn_add_words: for each of n words, rp[i] = ap[i] + bp[i] + carry,
;; with the carry chained from word to word.
;;   ARG0 (A4)	rp, written only
;;   ARG1 (B4)	ap
;;   ARG2 (A6)	bp
;;   ARG3 (B6)	n, number of 32-bit words (becomes the SPLOOP count)
;;   RET  (A4)	carry out of the last word, or 0 when n is 0
;;--------------------------------------------------------------------
_bn_add_words:
	.asmfunc
	MV	ARG3,B0		; B0 = n, also the "non-empty" predicate
  [!B0]	BNOP	RA		; n==0: return early with RET=0; the two
||[!B0]	MVK	0,RET		; packets below plus NOP 3 are its delay slots
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A1		; carry flag
|| [B0]	MV	ARG0,A3		; A3 = rp; ARG0 is RET and gets overwritten
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	ADDU	A7,B7,A9:A8	; A9:A8 = bp[i] + ap[i]
	ADDU	A1,A9:A8,A1:A0	; + incoming carry; A0 = word, A1 = carry
	SPKERNEL 0,0		; fully overlap BNOP RA,5
||	STW	A0,*A3++	; write result
||	MV	A1,RET		; keep carry flag in RET
;;====================================================================
	BNOP	RA,5		; return; RET already holds the last carry
	.endasmfunc

	.global	_bn_sub_words
;;--------------------------------------------------------------------
;; bn_sub_words: for each of n words, rp[i] = ap[i] - bp[i] - borrow,
;; with the borrow chained from word to word.
;;   ARG0 (A4)	rp, written only
;;   ARG1 (B4)	ap
;;   ARG2 (A6)	bp
;;   ARG3 (B6)	n, number of 32-bit words (becomes the SPLOOP count)
;;   RET  (A4)	borrow out of the last word (0 or 1), 0 when n is 0
;;--------------------------------------------------------------------
_bn_sub_words:
	.asmfunc
	MV	ARG3,B0		; B0 = n, also the "non-empty" predicate
  [!B0]	BNOP	RA		; n==0: return early with RET=0; the two
||[!B0]	MVK	0,RET		; packets below plus NOP 3 are its delay slots
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A2		; borrow flag
|| [B0]	MV	ARG0,A3		; A3 = rp pointer used for stores
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	SUBU	B7,A7,A1:A0	; A1:A0 = ap[i] - bp[i]
  [A2]	SUB	A1:A0,1,A1:A0	; take the incoming borrow, if any
	SPKERNEL 0,1		; leave slot for "return borrow flag"
||	STW	A0,*A3++	; write result
||	AND	1,A1,A2		; pass on borrow flag
;;====================================================================
	BNOP	RA,4		; return; AND below sits in the 5th delay slot
	AND	1,A1,RET	; return borrow flag
	.endasmfunc

	.global	_bn_div_words
;;--------------------------------------------------------------------
;; bn_div_words: bit-serial (shift/compare/subtract) division of the
;; two-word value hi:lo by dv.
;;   ARG0 (A4)	hi
;;   ARG1 (B4)	lo
;;   ARG2 (A6)	dv
;;   RET  (A4)	quotient, or -1 when hi has fewer leading zero bits
;;		than dv (reported as overflow)
;; The remainder is not returned.
;;--------------------------------------------------------------------
_bn_div_words:
	.asmfunc
	LMBD	1,A6,A0		; leading zero bits in dv
	LMBD	1,A4,A1		; leading zero bits in hi
||	MVK	32,B0
	CMPLTU	A1,A0,A2	; A2 = overflow: hi is wider than dv
||	ADD	A0,B0,B0	; B0 = 32 + leading zeros of dv
  [ A2]	BNOP	RA		; overflow: return; the five packets up to
||[ A2]	MVK	-1,A4		; return overflow
||[!A2]	MV	A4,A3		; reassign hi
  [!A2]	MV	B4,A4		; reassign lo, will be quotient
||[!A2]	MVC	B0,ILC		; SPLOOP count = 32 + leading zeros of dv
  [!A2]	SHL	A6,A0,A6	; normalize dv
||	MVK	1,A1		; A1=1 keeps the [!A1] pair below from
				; touching A4 on the overflow path

;; First shift/subtract step, peeled out of the loop and predicated on
;; "no overflow"; these packets also occupy delay slots of the
;; overflow return above.
  [!A2]	CMPLTU	A3,A6,A1	; hi<dv?
||[!A2]	SHL	A4,1,A5:A4	; lo<<1
  [!A1]	SUB	A3,A6,A3	; hi-=dv
||[!A1]	OR	1,A4,A4		; set quotient bit
  [!A2]	SHRU	A3,31,A1	; upper bit
||[!A2]	ADDAH	A5,A3,A3	; hi<<1|lo>>31

;; Remaining steps. A1 carries the bit shifted out of hi: when it is
;; set, the compare is skipped and A1 is only cleared, so the
;; subtract and quotient-bit packet that follows runs regardless.
	SPLOOP	3
  [!A1]	CMPLTU	A3,A6,A1	; hi<dv?
||[ A1]	ZERO	A1
||	SHL	A4,1,A5:A4	; lo<<1
  [!A1]	SUB	A3,A6,A3	; hi-=dv
||[!A1]	OR	1,A4,A4		; quotient
	SHRU	A3,31,A1	; upper bit
||	ADDAH	A5,A3,A3	; hi<<1|lo>>31
	SPKERNEL

	BNOP	RA,5		; return, quotient is in A4
	.endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
	.global	_bn_sqr_comba8
	.global	_bn_mul_comba8
;;--------------------------------------------------------------------
;; bn_mul_comba8: 8x8-word product, computed as M=8 passes of an
;; N=8-word multiply-and-add inner SPLOOP (one pass per ap[] word).
;;   ARG0 (A4)	rp, result words
;;   ARG1 (B4)	ap
;;   ARG2 (A6)	bp
;; bn_sqr_comba8 takes rp and ap only: it copies ARG1 into ARG2 and
;; falls through, i.e. it multiplies ap by itself.
;; Neither entry point writes a return value.
;;--------------------------------------------------------------------
_bn_sqr_comba8:
	MV	ARG1,ARG2	; bp = ap, then fall through
_bn_mul_comba8:
	.asmfunc
;; B4 is both ARG1 and the rp working copy: the two MVs below are in
;; one execute packet, so ap is copied out of B4 as rp is copied in.
	MVK	8,B0		; N, RILC
||	MVK	8,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC		; reload count for the passes after the first
||	SUB	B0,2,B1		; N-2, initial ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
sploopNxM?:			; for best performance arrange M<=N
   [A0]	SPLOOPD	2		; 2*n+10
||	MVC	B1,ILC
||	ADDAW	B4,B0,B5	; B5 = rp read pointer, N words past B4
||	ZERO	B7		; rp[i] reads as 0 during the first pass
||	LDW	*A5++,A9	; pre-fetch ap[1]
||	ZERO	A1		; A1=0: first pass does not load rp[i]
||	SUB	A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
	LDW	*ARG2++,A7	; bp[i]
	NOP	3
   [A1]	LDW	*B5++,B7	; rp[i]
	MPY32U	A7,B6,B17:B16	; B17:B16 = bp[i] * current ap word
	NOP	3
	ADDU	B16,B7,B21:B20	; lo(product) + rp[i]
	ADDU	B19,B21:B20,B19:B18	; + incoming carry; B18 = result word
||	MV.S	B17,B23		; hi(product), consumed by the ADD below
	SPKERNEL
||	STW	B18,*B4++	; rp[i]
||	ADD.S	B19,B23,B19	; outgoing carry = carry bits + hi(product)
;;====================================================================
;; Per-pass bookkeeping. NOTE(review): SPMASKR presumably marks where
;; the reloaded inner loop starts - confirm against the SPLOOP
;; chapter of the CPU guide.
outer?:				; m*2*(n+1)+10
	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
	SPMASKR
||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
	MVD	A9,B6		; move through .M unit(*)
   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
	SUBAW	B5,B2,B5	; rewind rp to rp[1]
	MVK	1,A1		; from now on the inner loop loads rp[i]
   [A0]	BNOP.S1	outer?,4	; next pass; STW below is its 5th delay slot
|| [A0]	SUB.L	A0,1,A0
	STW	B19,*B4--[B2]	; store top word of the pass, rewind rp to rp[1]
||	ZERO.S	B19		; high part of accumulator
;; end of outer?
	BNOP	RA,5		; return
	.endasmfunc
;; (*)	It should be noted that B6 is used as input to MPY32U in
;;	chronologically next cycle in *preceding* SPLOOP iteration.
;;	Normally such arrangement would require DINT, but at this
;;	point SPLOOP is draining and interrupts are disabled
;;	implicitly.

	.global	_bn_sqr_comba4
	.global	_bn_mul_comba4
;;--------------------------------------------------------------------
;; bn_mul_comba4: 4x4-word product, r[0..7] = a[0..3] * b[0..3].
;;   ARG0 (A4)	r, eight result words
;;   ARG1 (B4)	a
;;   ARG2 (A6)	b
;; bn_sqr_comba4 takes r and a only: it copies ARG1 into ARG2 and
;; falls through, i.e. it multiplies a by itself.
;; Neither entry point writes a return value.
;;
;; In the unrolled code the A side sums the low product halves of the
;; result word being formed together with its incoming carry, while
;; the B side sums the matching high halves; each B-side sum is added
;; on the A side one result word later.
;;--------------------------------------------------------------------
_bn_sqr_comba4:
	MV	ARG1,ARG2	; b = a, then fall through
_bn_mul_comba4:
	.asmfunc
	.if	0
	;; Disabled at assembly time: this arm would reuse the NxM
	;; SPLOOP of bn_mul_comba8 with N=M=4.
	BNOP	sploopNxM?,3
	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
	;; because of low-counter effect, when prologue phase finishes
	;; before SPKERNEL instruction is reached. As result it's 25%
	;; slower than expected...
	MVK	4,B0		; N, RILC
||	MVK	4,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; first ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
	.else
	;; This alternative is an exercise in fully unrolled Comba
	;; algorithm implementation that operates at n*(n+1)+12, or
	;; as little as 32 cycles...
	LDW	*ARG1[0],B16	; a[0]
||	LDW	*ARG2[0],A16	; b[0]
	LDW	*ARG1[1],B17	; a[1]
||	LDW	*ARG2[1],A17	; b[1]
	LDW	*ARG1[2],B18	; a[2]
||	LDW	*ARG2[2],A18	; b[2]
	LDW	*ARG1[3],B19	; a[3]
||	LDW	*ARG2[3],A19	; b[3]
	NOP
	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
	STW	A0,*ARG0[0]	; r[0] = lo(a[0]*b[0])
||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
||	ADDU	A22,A1,A1:A0	; r[1]: hi(a0*b0) + lo(a0*b1)
	MV	A23,B0		; B side: hi(a0*b1)
||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
||	ADDU	A24,A1:A0,A1:A0	; r[1]: + lo(a1*b0)
	ADDU	A25,B0,B1:B0	; B side: + hi(a1*b0)
||	STW	A0,*ARG0[1]	; r[1]
||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
||	ADDU	A26,A1,A9:A8	; r[2]: carry + lo(a2*b0)
	ADDU	A27,B1,B9:B8	; B side: carry + hi(a2*b0)
||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
||	ADDU	A28,A9:A8,A9:A8	; r[2]: + lo(a1*b1)
	ADDU	A29,B9:B8,B9:B8	; B side: + hi(a1*b1)
||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
||	ADDU	A30,A9:A8,A9:A8	; r[2]: + lo(a0*b2)
	ADDU	A31,B9:B8,B9:B8	; B side: + hi(a0*b2)
||	ADDU	B0,A9:A8,A9:A8	; r[2]: + previous B-side sum
	STW	A8,*ARG0[2]	; r[2]
||	ADDU	A20,A9,A1:A0	; r[3]: carry + lo(a3*b0)
	ADDU	A21,B9,B1:B0	; B side: carry + hi(a3*b0)
||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
||	ADDU	A22,A1:A0,A1:A0	; r[3]: + lo(a2*b1)
	ADDU	A23,B1:B0,B1:B0	; B side: + hi(a2*b1)
||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
||	ADDU	A24,A1:A0,A1:A0	; r[3]: + lo(a1*b2)
	ADDU	A25,B1:B0,B1:B0	; B side: + hi(a1*b2)
||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
||	ADDU	A26,A1:A0,A1:A0	; r[3]: + lo(a0*b3)
	ADDU	A27,B1:B0,B1:B0	; B side: + hi(a0*b3)
||	ADDU	B8,A1:A0,A1:A0	; r[3]: + previous B-side sum
	STW	A0,*ARG0[3]	; r[3]
||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
||	ADDU	A20,A1,A9:A8	; r[4]: carry + lo(a3*b1)
	ADDU	A21,B1,B9:B8	; B side: carry + hi(a3*b1)
||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
||	ADDU	A22,A9:A8,A9:A8	; r[4]: + lo(a2*b2)
	ADDU	A23,B9:B8,B9:B8	; B side: + hi(a2*b2)
||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
||	ADDU	A24,A9:A8,A9:A8	; r[4]: + lo(a1*b3)
	ADDU	A25,B9:B8,B9:B8	; B side: + hi(a1*b3)
||	ADDU	B0,A9:A8,A9:A8	; r[4]: + previous B-side sum
	STW	A8,*ARG0[4]	; r[4]
||	ADDU	A26,A9,A1:A0	; r[5]: carry + lo(a3*b2)
	ADDU	A27,B9,B1:B0	; B side: carry + hi(a3*b2)
||	ADDU	A28,A1:A0,A1:A0	; r[5]: + lo(a2*b3)
	ADDU	A29,B1:B0,B1:B0	; B side: + hi(a2*b3)
||	BNOP	RA		; return; the 5 packets below are delay slots
||	ADDU	B8,A1:A0,A1:A0	; r[5]: + previous B-side sum
	STW	A0,*ARG0[5]	; r[5]
||	ADDU	A30,A1,A9:A8	; r[6]: carry + lo(a3*b3)
	ADD	A31,B1,B8	; B8 = hi(a3*b3) + B-side carry
	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
	ADD	B8,A9,A9	; r[7] = B8 + carry out of r[6]
||	STW	A8,*ARG0[6]	; r[6]
	STW	A9,*ARG0[7]	; r[7]
	.endif
	.endasmfunc
v1.1.1t 2023-05-09 22:08:48 +00:00			`;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.`
			`;;`
			`;; Licensed under the OpenSSL license (the "License"). You may not use`
			`;; this file except in compliance with the License. You can obtain a copy`
			`;; in the file LICENSE in the source distribution or at`
			`;; https://www.openssl.org/source/license.html`
			`;;`
			`;;====================================================================`
			`;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL`
			`;; project.`
			`;;`
			`;; Rights for redistribution and usage in source and binary forms are`
			`;; granted according to the OpenSSL license. Warranty of any kind is`
			`;; disclaimed.`
			`;;====================================================================`
			`;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n`
			`;; being the number of 32-bit words, addition - 8*n. Corresponding 4x`
			`;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler`
			`;; SPLOOPs spin at ... 2*n cycles [plus epilogue].`
			`;;====================================================================`
			`.text`

			`.if .ASSEMBLER_VERSION<7000000`
			`.asg 0,__TI_EABI__`
			`.endif`
			`.if __TI_EABI__`
			`.asg bn_mul_add_words,_bn_mul_add_words`
			`.asg bn_mul_words,_bn_mul_words`
			`.asg bn_sqr_words,_bn_sqr_words`
			`.asg bn_add_words,_bn_add_words`
			`.asg bn_sub_words,_bn_sub_words`
			`.asg bn_div_words,_bn_div_words`
			`.asg bn_sqr_comba8,_bn_sqr_comba8`
			`.asg bn_mul_comba8,_bn_mul_comba8`
			`.asg bn_sqr_comba4,_bn_sqr_comba4`
			`.asg bn_mul_comba4,_bn_mul_comba4`
			`.endif`

			`.asg B3,RA`
			`.asg A4,ARG0`
			`.asg B4,ARG1`
			`.asg A6,ARG2`
			`.asg B6,ARG3`
			`.asg A8,ARG4`
			`.asg B8,ARG5`
			`.asg A4,RET`
			`.asg A15,FP`
			`.asg B14,DP`
			`.asg B15,SP`

			`.global _bn_mul_add_words`
			`_bn_mul_add_words:`
			`.asmfunc`
			`MV ARG2,B0`
			`[!B0] BNOP RA`
			`\|\|[!B0] MVK 0,RET`
			`[B0] MVC B0,ILC`
			`[B0] ZERO A19 ; high part of accumulator`
			`\|\| [B0] MV ARG0,A2`
			`\|\| [B0] MV ARG3,A3`
			`NOP 3`

			`SPLOOP 2 ; 2*n+10`
			`;;====================================================================`
			`LDW *ARG1++,B7 ; ap[i]`
			`NOP 3`
			`LDW *ARG0++,A7 ; rp[i]`
			`MPY32U B7,A3,A17:A16`
			`NOP 3 ; [2,0] in epilogue`
			`ADDU A16,A7,A21:A20`
			`ADDU A19,A21:A20,A19:A18`
			`\|\| MV.S A17,A23`
			`SPKERNEL 2,1 ; leave slot for "return value"`
			`\|\| STW A18,*A2++ ; rp[i]`
			`\|\| ADD A19,A23,A19`
			`;;====================================================================`
			`BNOP RA,4`
			`MV A19,RET ; return value`
			`.endasmfunc`

			`.global _bn_mul_words`
			`_bn_mul_words:`
			`.asmfunc`
			`MV ARG2,B0`
			`[!B0] BNOP RA`
			`\|\|[!B0] MVK 0,RET`
			`[B0] MVC B0,ILC`
			`[B0] ZERO A19 ; high part of accumulator`
			`NOP 3`

			`SPLOOP 2 ; 2*n+10`
			`;;====================================================================`
			`LDW *ARG1++,A7 ; ap[i]`
			`NOP 4`
			`MPY32U A7,ARG3,A17:A16`
			`NOP 4 ; [2,0] in epiloque`
			`ADDU A19,A16,A19:A18`
			`\|\| MV.S A17,A21`
			`SPKERNEL 2,1 ; leave slot for "return value"`
			`\|\| STW A18,*ARG0++ ; rp[i]`
			`\|\| ADD.L A19,A21,A19`
			`;;====================================================================`
			`BNOP RA,4`
			`MV A19,RET ; return value`
			`.endasmfunc`

			`.global _bn_sqr_words`
			`_bn_sqr_words:`
			`.asmfunc`
			`MV ARG2,B0`
			`[!B0] BNOP RA`
			`\|\|[!B0] MVK 0,RET`
			`[B0] MVC B0,ILC`
			`[B0] MV ARG0,B2`
			`\|\| [B0] ADD 4,ARG0,ARG0`
			`NOP 3`

			`SPLOOP 2 ; 2*n+10`
			`;;====================================================================`
			`LDW *ARG1++,B7 ; ap[i]`
			`NOP 4`
			`MPY32U B7,B7,B1:B0`
			`NOP 3 ; [2,0] in epilogue`
			`STW B0,*B2++(8) ; rp[2*i]`
			`MV B1,A1`
			`SPKERNEL 2,0 ; fully overlap BNOP RA,5`
			`\|\| STW A1,*ARG0++(8) ; rp[2*i+1]`
			`;;====================================================================`
			`BNOP RA,5`
			`.endasmfunc`

			`.global _bn_add_words`
			`_bn_add_words:`
			`.asmfunc`
			`MV ARG3,B0`
			`[!B0] BNOP RA`
			`\|\|[!B0] MVK 0,RET`
			`[B0] MVC B0,ILC`
			`[B0] ZERO A1 ; carry flag`
			`\|\| [B0] MV ARG0,A3`
			`NOP 3`

			`SPLOOP 2 ; 2*n+6`
			`;;====================================================================`
			`LDW *ARG2++,A7 ; bp[i]`
			`\|\| LDW *ARG1++,B7 ; ap[i]`
			`NOP 4`
			`ADDU A7,B7,A9:A8`
			`ADDU A1,A9:A8,A1:A0`
			`SPKERNEL 0,0 ; fully overlap BNOP RA,5`
			`\|\| STW A0,*A3++ ; write result`
			`\|\| MV A1,RET ; keep carry flag in RET`
			`;;====================================================================`
			`BNOP RA,5`
			`.endasmfunc`

			`.global _bn_sub_words`
			`_bn_sub_words:`
			`.asmfunc`
			`MV ARG3,B0`
			`[!B0] BNOP RA`
			`\|\|[!B0] MVK 0,RET`
			`[B0] MVC B0,ILC`
			`[B0] ZERO A2 ; borrow flag`
			`\|\| [B0] MV ARG0,A3`
			`NOP 3`

			`SPLOOP 2 ; 2*n+6`
			`;;====================================================================`
			`LDW *ARG2++,A7 ; bp[i]`
			`\|\| LDW *ARG1++,B7 ; ap[i]`
			`NOP 4`
			`SUBU B7,A7,A1:A0`
			`[A2] SUB A1:A0,1,A1:A0`
			`SPKERNEL 0,1 ; leave slot for "return borrow flag"`
			`\|\| STW A0,*A3++ ; write result`
			`\|\| AND 1,A1,A2 ; pass on borrow flag`
			`;;====================================================================`
			`BNOP RA,4`
			`AND 1,A1,RET ; return borrow flag`
			`.endasmfunc`

			`.global _bn_div_words`
			`_bn_div_words:`
			`.asmfunc`
			`LMBD 1,A6,A0 ; leading zero bits in dv`
			`LMBD 1,A4,A1 ; leading zero bits in hi`
			`\|\| MVK 32,B0`
			`CMPLTU A1,A0,A2`
			`\|\| ADD A0,B0,B0`
			`[ A2] BNOP RA`
			`\|\|[ A2] MVK -1,A4 ; return overflow`
			`\|\|[!A2] MV A4,A3 ; reassign hi`
			`[!A2] MV B4,A4 ; reassign lo, will be quotient`
			`\|\|[!A2] MVC B0,ILC`
			`[!A2] SHL A6,A0,A6 ; normalize dv`
			`\|\| MVK 1,A1`

			`[!A2] CMPLTU A3,A6,A1 ; hi<dv?`
			`\|\|[!A2] SHL A4,1,A5:A4 ; lo<<1`
			`[!A1] SUB A3,A6,A3 ; hi-=dv`
			`\|\|[!A1] OR 1,A4,A4`
			`[!A2] SHRU A3,31,A1 ; upper bit`
			`\|\|[!A2] ADDAH A5,A3,A3 ; hi<<1\|lo>>31`

			`SPLOOP 3`
			`[!A1] CMPLTU A3,A6,A1 ; hi<dv?`
			`\|\|[ A1] ZERO A1`
			`\|\| SHL A4,1,A5:A4 ; lo<<1`
			`[!A1] SUB A3,A6,A3 ; hi-=dv`
			`\|\|[!A1] OR 1,A4,A4 ; quotient`
			`SHRU A3,31,A1 ; upper bit`
			`\|\| ADDAH A5,A3,A3 ; hi<<1\|lo>>31`
			`SPKERNEL`

			`BNOP RA,5`
			`.endasmfunc`

			`;;====================================================================`
			`;; Not really Comba algorithm, just straightforward NxM... Dedicated`
			`;; fully unrolled real Comba implementations are asymptotically 2x`
			`;; faster, but naturally larger undertaking. Purpose of this exercise`
			`;; was rather to learn to master nested SPLOOPs...`
			`;;====================================================================`
			`.global _bn_sqr_comba8`
			`.global _bn_mul_comba8`
			`_bn_sqr_comba8:`
			`MV ARG1,ARG2`
			`_bn_mul_comba8:`
			`.asmfunc`
			`MVK 8,B0 ; N, RILC`
			`\|\| MVK 8,A0 ; M, outer loop counter`
			`\|\| MV ARG1,A5 ; copy ap`
			`\|\| MV ARG0,B4 ; copy rp`
			`\|\| ZERO B19 ; high part of accumulator`
			`MVC B0,RILC`
			`\|\| SUB B0,2,B1 ; N-2, initial ILC`
			`\|\| SUB B0,1,B2 ; const B2=N-1`
			`\|\| LDW *A5++,B6 ; ap[0]`
			`\|\| MV A0,A3 ; const A3=M`
			`sploopNxM?: ; for best performance arrange M<=N`
			`[A0] SPLOOPD 2 ; 2*n+10`
			`\|\| MVC B1,ILC`
			`\|\| ADDAW B4,B0,B5`
			`\|\| ZERO B7`
			`\|\| LDW *A5++,A9 ; pre-fetch ap[1]`
			`\|\| ZERO A1`
			`\|\| SUB A0,1,A0`
			`;;====================================================================`
			`;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.`
			`;; This is because of Advisory 15 from TI publication SPRZ247I.`
			`LDW *ARG2++,A7 ; bp[i]`
			`NOP 3`
			`[A1] LDW *B5++,B7 ; rp[i]`
			`MPY32U A7,B6,B17:B16`
			`NOP 3`
			`ADDU B16,B7,B21:B20`
			`ADDU B19,B21:B20,B19:B18`
			`\|\| MV.S B17,B23`
			`SPKERNEL`
			`\|\| STW B18,*B4++ ; rp[i]`
			`\|\| ADD.S B19,B23,B19`
			`;;====================================================================`
			`outer?: ; m*2*(n+1)+10`
			`SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]`
			`SPMASKR`
			`\|\| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?`
			`MVD A9,B6 ; move through .M unit(*)`
			`[A2] LDW *A5++,A9 ; pre-fetch ap[i+1]`
			`SUBAW B5,B2,B5 ; rewind rp to rp[1]`
			`MVK 1,A1`
			`[A0] BNOP.S1 outer?,4`
			`\|\| [A0] SUB.L A0,1,A0`
			`STW B19,*B4--[B2] ; rewind rp tp rp[1]`
			`\|\| ZERO.S B19 ; high part of accumulator`
			`;; end of outer?`
			`BNOP RA,5 ; return`
			`.endasmfunc`
			`;; (*) It should be noted that B6 is used as input to MPY32U in`
			`;; chronologically next cycle in *preceding* SPLOOP iteration.`
			`;; Normally such arrangement would require DINT, but at this`
			`;; point SPLOOP is draining and interrupts are disabled`
			`;; implicitly.`

			`.global _bn_sqr_comba4`
			`.global _bn_mul_comba4`
			`_bn_sqr_comba4:`
			`MV ARG1,ARG2`
			`_bn_mul_comba4:`
			`.asmfunc`
			`.if 0`
			`BNOP sploopNxM?,3`
			`;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,`
			`;; because of low-counter effect, when prologue phase finishes`
			`;; before SPKERNEL instruction is reached. As result it's 25%`
			`;; slower than expected...`
			`MVK 4,B0 ; N, RILC`
			`\|\| MVK 4,A0 ; M, outer loop counter`
			`\|\| MV ARG1,A5 ; copy ap`
			`\|\| MV ARG0,B4 ; copy rp`
			`\|\| ZERO B19 ; high part of accumulator`
			`MVC B0,RILC`
			`\|\| SUB B0,2,B1 ; first ILC`
			`\|\| SUB B0,1,B2 ; const B2=N-1`
			`\|\| LDW *A5++,B6 ; ap[0]`
			`\|\| MV A0,A3 ; const A3=M`
			`.else`
			`;; This alternative is an exercise in fully unrolled Comba`
			`;; algorithm implementation that operates at n*(n+1)+12, or`
			`;; as little as 32 cycles...`
			`LDW *ARG1[0],B16 ; a[0]`
			`\|\| LDW *ARG2[0],A16 ; b[0]`
			`LDW *ARG1[1],B17 ; a[1]`
			`\|\| LDW *ARG2[1],A17 ; b[1]`
			`LDW *ARG1[2],B18 ; a[2]`
			`\|\| LDW *ARG2[2],A18 ; b[2]`
			`LDW *ARG1[3],B19 ; a[3]`
			`\|\| LDW *ARG2[3],A19 ; b[3]`
			`NOP`
			`MPY32U A16,B16,A1:A0 ; a[0]*b[0]`
			`MPY32U A17,B16,A23:A22 ; a[0]*b[1]`
			`MPY32U A16,B17,A25:A24 ; a[1]*b[0]`
			`MPY32U A16,B18,A27:A26 ; a[2]*b[0]`
			`STW A0,*ARG0[0]`
			`\|\| MPY32U A17,B17,A29:A28 ; a[1]*b[1]`
			`MPY32U A18,B16,A31:A30 ; a[0]*b[2]`
			`\|\| ADDU A22,A1,A1:A0`
			`MV A23,B0`
			`\|\| MPY32U A19,B16,A21:A20 ; a[3]*b[0]`
			`\|\| ADDU A24,A1:A0,A1:A0`
			`ADDU A25,B0,B1:B0`
			`\|\| STW A0,*ARG0[1]`
			`\|\| MPY32U A18,B17,A23:A22 ; a[2]*b[1]`
			`\|\| ADDU A26,A1,A9:A8`
			`ADDU A27,B1,B9:B8`
			`\|\| MPY32U A17,B18,A25:A24 ; a[1]*b[2]`
			`\|\| ADDU A28,A9:A8,A9:A8`
			`ADDU A29,B9:B8,B9:B8`
			`\|\| MPY32U A16,B19,A27:A26 ; a[0]*b[3]`
			`\|\| ADDU A30,A9:A8,A9:A8`
			`ADDU A31,B9:B8,B9:B8`
			`\|\| ADDU B0,A9:A8,A9:A8`
			`STW A8,*ARG0[2]`
			`\|\| ADDU A20,A9,A1:A0`
			`ADDU A21,B9,B1:B0`
			`\|\| MPY32U A19,B17,A21:A20 ; a[3]*b[1]`
			`\|\| ADDU A22,A1:A0,A1:A0`
			`ADDU A23,B1:B0,B1:B0`
			`\|\| MPY32U A18,B18,A23:A22 ; a[2]*b[2]`
			`\|\| ADDU A24,A1:A0,A1:A0`
			`ADDU A25,B1:B0,B1:B0`
			`\|\| MPY32U A17,B19,A25:A24 ; a[1]*b[3]`
			`\|\| ADDU A26,A1:A0,A1:A0`
			`ADDU A27,B1:B0,B1:B0`
			`\|\| ADDU B8,A1:A0,A1:A0`
			`STW A0,*ARG0[3]`
			`\|\| MPY32U A19,B18,A27:A26 ; a[3]*b[2]`
			`\|\| ADDU A20,A1,A9:A8`
			`ADDU A21,B1,B9:B8`
			`\|\| MPY32U A18,B19,A29:A28 ; a[2]*b[3]`
			`\|\| ADDU A22,A9:A8,A9:A8`
			`ADDU A23,B9:B8,B9:B8`
			`\|\| MPY32U A19,B19,A31:A30 ; a[3]*b[3]`
			`\|\| ADDU A24,A9:A8,A9:A8`
			`ADDU A25,B9:B8,B9:B8`
			`\|\| ADDU B0,A9:A8,A9:A8`
			`STW A8,*ARG0[4]`
			`\|\| ADDU A26,A9,A1:A0`
			`ADDU A27,B9,B1:B0`
			`\|\| ADDU A28,A1:A0,A1:A0`
			`ADDU A29,B1:B0,B1:B0`
			`\|\| BNOP RA`
			`\|\| ADDU B8,A1:A0,A1:A0`
			`STW A0,*ARG0[5]`
			`\|\| ADDU A30,A1,A9:A8`
			`ADD A31,B1,B8`
			`ADDU B0,A9:A8,A9:A8 ; removed \|\| to avoid cross-path stall below`
			`ADD B8,A9,A9`
			`\|\| STW A8,*ARG0[6]`
			`STW A9,*ARG0[7]`
			`.endif`
			`.endasmfunc`