| 
;Last Modified:  16-APR-1992 09:06:30.46
	.title  fprims  Fast Multiple Precision Primitives
	.ident  /V1.7B/
;+
; **-FPRIMS-Fast Multiple Precision Primitives
;
; Facility:     PGP
;
; Language:     Macro-32
;
; Functional Description:
;
; This module contains fast multiple precision routines for operating on arrays
; of long words. Error checking is minimised at the expense of speed.
;
; Restrictions:
;
; This code is shareable but NOT reentrant as written because of static data.
; A reentrant version of this module could be written but it would be slower!
;
; Version:      1
;
; Original:     00A     Date:   17-Sep-1991     Author: Hugh A.J. Kennedy
;
; Based on FPRIMS.ASM written by Zhahai Stewart for the Intel 8086
; architecture.
;
; Modification: 02A     Date:   27-Sep-1991     Author: Hugh A.J. Kennedy.
;
; Add fast multiply routine, P_SMUL.
; Re-organise code slightly.
; Ammend/clarify copyright and license statement.
; Add checking for maximum precision exceeded, display a warning message
; and bomb!
;
; Modification: 03A     Date:   16-Mar-1992     Author: Hugh A.J. Kennedy.
;
; Sniff for MSB in P_SMUL. In this way, avoid multiplies by leading zeroes
; (not efficient).
;
; Modification: 05A     Date:   17-Mar-1992     Author: Hugh A.J. Kennedy.
;
; Encode entire double precision multiply in VAX assembler.
; Correct some minor problems with handling embedded zeroes.
;
; Modification: 06A     Date:   17-Mar-1992     Author: Hugh A.J. Kennedy
;
; Align everything for speed. VAXen like stuff on 64-bit, or at least 32-bit
; boundaries. Therefore, we align the add, subtract and rotate tables and then
; we align the multiply loops. The extra NOPs used to pad these loops are of
; negligable cost because they already exist in the memory buffer. When the
; following instruction is aligned, it executes MUCH faster.
;
; Modification: 07A     Date:   24-Mar-1991     Author: Hugh A.J. Kennedy.
;
; Implement fast compare.
;-
	.sbttl  Copyright Notice And License To Use
;
;                 Copyright (c) 1991-1992, All Rights Reserved by
;                            Hugh A.J. Kennedy.
;
;       A license to use and adapt this software without payment is hereby 
;       granted subject to the following conditions:
;
;       1) It may only be copied with the inclusion of this copyright
;       notice in the program source with these associated conditions.
;
;       2) No title to or ownership of this software is hereby 
;       transferred.
;
;       3) The  information  in this software is subject  to change 
;       without notice and should  not be construed as a  commitment  by  
;       Hugh Kennedy.
;
;       4) The author assumes no liability for any damages arising from the 
;       use of this software, even if said damages  arises from defects in 
;       this software.
;
;       5) No warranty as to merchantability or fitness of purpose is 
;       expressed or implied.
;
;       6) Any modifications to this source must be clearly identified as
;       such and added to the modification history.
;
;       7) These routines may not be incorporated in a commercial cryptographic
;       product.
;
; If you can not comply with these conditions, you *must* contact the author
; and obtain permission other wise you are in violation of copyright.
	.sbttl  Misc Macros & Definitions
;
; Assembly Parameters
;
max_unit_prec   =       72                      ; Maximum unit precision
supersniffer    =       1                       ; Enable bit msb locator.
;
; The following parameter is dependent on the kind of VAX you are running on
; and should be defined if the execution time of the SOBGTR loop control
; instruction and the appropriate operation (ADWC or SBWC) from cache is much
; less than the execution time in main memory. If you have a slow VAX you
; should comment the following line out to use a vector of instructions.
;
novector        =       1                       ; Use loops rather than vectors.
.macro  ascid   .string
;+
; *-ASCID-Build An ASCII String Referenced By Descriptor
;
; Functional Description:
;
;       This macro is a little like the system supplied .ASCID directive
; but it uses a separate program section to store the ASCII data.
;
; Arguments:
;
;       STRING          String to create
;-
	.nocross
	.save_psect
	.psect  puret
$$$t0   =       .
	.ascii  @.string@
$$$t1   =       .-$$$t0
	.restore_psect
	.word   $$$t1
	.byte   dsc$k_dtype_t
	.byte   dsc$k_class_s
	.address -
		$$$t0
	.cross
.endm   ascid
	.sbttl  Misc Data Areas
;
; Misc. Data Areas
;
	.psect  impurd,con,lcl,noshr,exe,rd,wrt,long
;
; This data is static and is used to hold the current precision established
; by P_SETP for other calls to this library.
;
.if not_defined novector
addoff:                                         ; Offset into add table.
	.blkl   1                               ; also for sub and rot.
.endc
precis:                                         ; Precision in longwords.
	.blkl   1
	.psect  pure,con,rel,shr,exe,rd,nowrt,quad
	.align  quad
.if not_defined novector
prectoobig:
	ascid   <PGP (FPRIMS) - Requested precision (!ZL) exceeds capacity (!ZL)>
.endc
	.sbttl  Start of Code
	.sbttl  P_CMP           Compare two very long integers
;+
; **-P_COMP-Compare two very long integers
;
; Functional Description:
;
; This procedure is invoked to compare two extended precision unsigned
; integers.
;
; Calling Sequence
;
;       short P_CMP ( r1, r2)
;
; Parameters:
;
;       R1      ->      Extended Precision Integer 1
;       R2      ->      Extended Precision Integer 2
;
; Implicit Inputs:
;
;       PRECIS          lr*r    Precision expresses in longs.
;
; Returns:
;
;       -1      if r1 < r2
;        0      if r1 = r2
;       +1      if r1 > r2
;
;-
	.align  long
	.entry  p_cmp,^m<r2>
	movl    4(ap),r1                        ; R1 -> Sum.
	movl    8(ap),r2                        ; R2 -> Addend.
	movl    precis,r0                       ; R0 = Precision.
	moval   (r1)[r0],r1                     ; Get MS longwords.
	moval   (r2)[r0],r2                     ; Get MS longwords.
.align  long    1                               ; Align loop with NOPS.
10$:    cmpl    -(r1),-(r2)                     ; Compare.
	bnequ   20$                             ; If ne, then exit loop.
	sobgtr  r0,10$                          ; Loop until done.
	ret                                     ; R0 = zero so R1 = R2.
20$:
	bgtru   30$                             ; If R1 > R2 then branch.
	movw    #-1,r0                          ; Flag <.
	ret
30$:
	movw    #1,r0                           ; Flag >.
	ret
	.sbttl  P_ADDC          Add two very long integers with carry
;+
; **-P_ADDC-Add very long integers
;
; Functional Description:
;
; This procedure is invoked to add two very long integers with carry. Each
; integer is represented as an array of longwords, least significant first.
;
; Calling Sequence:
;
;       P_ADDC  sum,addend,carry
;
; Parameters:
;
;       sum             lm*r            Sum.
;       addend          lr*r            Addend.
;       carry           lr*v            Carry bit.
;
; Implicit Inputs:
;
;       Addoff          This is used as an offset into the various tables
;                       of adds, subtracts and rotates to implement the
;                       operation to the requested precsion.
;
; Status Returns:
;
;       R0      Resulting carry bit.
;-
	.align  long
	.entry  p_addc,^m<r2,r3>
	movl    4(ap),r1                        ; R1 -> Sum.
	movl    8(ap),r2                        ; R2 -> Addend.
.if defined novector
	movl    precis,r3                       ; R3 = Precision.
	subl3   12(ap),#0,r0                    ; Set carry bit.
	.align  quad,1                          ; Align loop with NOPs
10$:    adwc    (r2)+,(r1)+                     ; Add with carry one longword.
	.align  quad,1                          ; Align next instruction.
	sobgtr  r3,10$                          ; Loop until done.
.iff ; novector
	moval   10$,r3
	addl2   addoff,r3                       ; Jump into table.
	subl3   12(ap),#0,r0                    ; Set carry bit.
	jmp     (r3)
	.align  quad
10$:
	.rept   max_unit_prec
$$$     =       .
	adwc    (r2)+,(r1)+                     ; Add with carry one longword.
	nop
addsiz  =       .-$$$
	.endr
.endc ; novector
	clrl    r0                              ; Assume carry clear.
	bcc     20$                             ; Carry set?
	incl    r0                              ; Flag carry was set.
20$:    ret
	.sbttl  P_SUBB  Subtract very long integers with borrow
;+
; **-P_SUBB-Subtract very long integers
;
; Functional Description:
;
; This procedure is invoked to add subtract very long integers with carry. Each
; integer is represented as an array of longwords, least significant first.
;
; Calling Sequence:
;
;       P_SUBB  diff,sub,borrow
;
; Parameters:
;
;       diff            lm*r            Difference
;       sub             lr*r            Subtrahend.
;       borrow          lr*v            Borrow bit.
;
; Implicit Inputs:
;
;       Addoff          This is used as an offset into the various tables
;                       of adds, subtracts and rotates to implement the
;                       operation to the requested precsion.
;
; Status Returns:
;
;       R0      Resulting carry bit.
;-
	.align  long
	.entry  p_subb,^m<r2,r3>
	movl    4(ap),r1                        ; R1 -> Difference.
	movl    8(ap),r2                        ; R2 -> Minuend.
.if defined novector
	movl    precis,r3                       ; R3 = No. of longs.
	subl3   12(ap),#0,r0                    ; Set borrow bit.
	.align  quad,1                          ; Align loop with NOPs.
10$:    sbwc    (r2)+,(r1)+                     ; Subtract with borrow one long.
	.align  quad,1                          ; Align with NOPs.
	sobgtr  r3,10$                          ; Loop through.
.iff ; novector
	moval   10$,r3
	addl2   addoff,r3                       ; Jump into table.
	subl3   12(ap),#0,r0                    ; Set borrow bit.
	jmp     (r3)
	.align  quad
10$:
	.rept   max_unit_prec
	sbwc    (r2)+,(r1)+                     ; Subtract w/carry one longword.
	nop
	.endr
.endc ; novector
	clrl    r0                              ; Assume carry clear.
	bcc     20$                             ; Carry set?
	incl    r0                              ; Flag carry was set.
20$:    ret
	.sbttl  P_ROTL  Rotate left a very long integer with carry.
;+
; **-P_ROTL-Rotate left one bit very long integers
;
; Functional Description:
;
; This procedure is invoked to rotate left one bit (e.g. divide by 2) very 
; long integers with carry. Each integer is represented as an array of 
; longwords, least significant first. Note that we use the add with carry
; instruction here because the VAX (unlike the dear old PDP-11) lacks a
; rotate instruction that includes the carry bit.
;
; Calling Sequence:
;
;       P_ROTL  num,carry
;
; Parameters:
;
;       num             lm*r            Number to be shifted
;       carry           lr*v            Carry bit.
;
; Implicit Inputs:
;
;       Addoff          This is used as an offset into the various tables
;                       of adds, subtracts and rotates to implement the
;                       operation to the requested precsion.
;
; Status Returns:
;
;       R0      Resulting carry bit.
;-
	.align  long
	.entry  p_rotl,^m<r3>
	movl    4(ap),r1                        ; R1 -> Sum.
.if defined novector
	movl    precis,r3                       ; R3 = No. of longwords.
	subl3   8(ap),#0,r0                     ; Set carry bit.
	.align  quad,1                          ; Align loop with NOPs
10$:    adwc    (r1),(r1)+                      ; Add to itself with carry.
	.align  quad,1                          ; Align with NOPs.
	sobgtr  r3,10$                          ; Loop until done.
.iff ; novector
	moval   10$,r3
	addl2   addoff,r3                       ; Jump into table.
	subl3   8(ap),#0,r0                     ; Set carry bit.
	jmp     (r3)
	.align  quad
10$:
	.rept   max_unit_prec
	adwc    (r1),(r1)+                      ; *2+carry.
	nop
	.endr
.endc ; novector
	clrl    r0                              ; Assume carry clear.
	bcc     20$                             ; Carry set?
	incl    r0                              ; Flag carry was set.
20$:    ret
	.sbttl  P_DMUL  Extended Multiple Precision Multiply
;*
; **-P_DMUL-Extended Multiple Precision Multiply
;
; Functional Description:
;
; This procedure multiplies an unsigned single precision multiplier by a 
; single precision multiplicand. The product register is double precision.
; It is expected that the length of the single precision multiplier and
; multiplicand has been previously set by a call to P_SETP. Note that the
; entire length of the product register is zeroed - so it must be a full
; double precision size.
;
; Calling Sequence:
;
;       P_DMUL  prod, multiplicand, multiplier
;
; Parameters:
;
;       prod            lw*r    Product.
;       multuplicand    lr*r    Multiplicand
;       multiplier      lr*r    Multiplier
;
; Implicit Inputs:
;
;       PRECIS          lr*r    Precision expresses in longs.
;
; Status Returns:
;
;       None.
;-
	.align  long
	.entry  p_dmul,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11>
	movl    4(ap),r8                        ; R8 -> Product.
	beql    49$                             ; If eq, not specified.
	movl    precis,r10                      ; R10 = Precision (longs)
	ashl    #3,r10,r2                       ; R0 = No. of bytes to zero.
	movc5   #0,#0,#0,r2,(r8)                ; Zero product buffer.
	movl    8(ap),r3                        ; R3 -> Multiplicand.
	beql    49$                             ; If eq, not specified.
	pushl   r3                              ; Save for posterity.
	movl    12(ap),r11                      ; R11 -> Multiplier.
	beql    49$                             ; If eq, not specified.
	movl    r10,r12                         ; R12 = Multiplicand prec.
.if     defined SUPERSNIFFER
;
; Here we calculate the effective maximum precision for the multiply by
; locating the long containing the most significant bit of the multiplier
; and the multiplicand.
;
	moval   (r11)[r10],r0                   ; Supersniffer...
	.align  quad,1                          ; Align with nops
45$:    tstl    -(r0)                           ; Examine next long.
	bneq    50$                             ; If ne, then we found msb.
	sobgtr  r10,45$                         ; Loop until done.
49$:    ret                                     ; Multiplier = 0!
50$:
	moval   (r3)[r12],r0                    ; Supersniffer...
	.align  quad,1                          ; Align with nops
55$:    tstl    -(r0)                           ; Examine next long.
	bneq    200$                            ; If ne, then we found msb.
	sobgtr  r12,55$                         ; Loop until done.
	ret                                     ; Multiplicand = 0!
.iff
	brb     200$
49$:
	ret
.endc   ; SUPERSNIFFER
;
; Multiplier Loop
;
; R12 = Count of multiplicand longs to process.
; R11 -> Next long of multiplier.
; R10 = Count of multiplier longs to process.
; R8 -> Next long of product.
;
	.align  quad,1                          ; Align with nops
200$:   movl    r12,r5                          ; Multiplicand precision.
	moval   (r8)+,r4                        ; R4 -> Next long of product.
	movl    (sp),r3                         ; R3 -> 1st multiplier long.
	movl    (r4),r0                         ; R0,R1 = Partial Sum.
	movl    4(r4),r1
	clrl    r7                              ; Zero look-ahead carry.
;
; Perform an extended multiply of two unsigned numbers. This means that
; we have to compensate the hi-order product because either the multiplier
; or the multiplicand may be apparently a negative number. EMUL is a signed
; multiply - so we must be careful. Also, the EMUL longword addend is sign 
; extended before adding into the product so we have to add the hard way.
;
; R6 =          Current Multiplicand
; R2 =          Multiplier
; R4 ->         Current quadword of partial product.
; R0,R1 =       Partial sum to which product is added
; R7 =          Lookahead carry. This gets set if we try to carry after adding
;               the partial product to the partial sum. This gets a little more
;               complicated because here we are setting the high-order long of
;               the next quadword to be operated on.
;
; Essentially the algorithm is as follows:
;
; 0) R0,R1 = (R4)               ; Save current partial sum.
; 1) R6 = Next longword of multiplicand.
; 2) (R4) = R6 * R2             ; quad result compensating for negative numbers)
; 3) (R4) = (R4) + R0,R1        ; add back partial sum.
; 4) R7 = Carry bit.
; 5) R4 = R4 + 4                ; Point to next long.
; 6) R1 = 4(R4) + R7            ; Propagate carry to high order of next partial
;                               ; sum.
; 7) Loop back to step 1 until multiplicand completely processed.
;
	movl    (r11)+,r2                       ; R2 = Multiplier.
	beql    999$                            ; If eq, not specified.
	blss    1500$                           ; This unfolds the compensation
						; test out of the loop.
;
; This version of the multiply loop is entered when the multiplier is positive
; saving three instructions per unit of precision.
;
	.align  quad,1                          ; Align with NOPs.
500$:   movl    (r3)+,r6                        ; R6 = Current multplicand.
	emul    r2,r6,#0,(r4)                   ; Multiply (64-bit result).
;
; Because we have removed leading zeroes, multiplication by zero is very
; unlikely, 1 in 2^32 or so. It is therefore easier to perform the test after
; the EMUL (looking at the zero product) that the multiplicand was zero so we 
; don't need any special case logic later to adjust the product pointer.
;
	beql    550$                            ; If result eq, skip.
	tstl    r6                              ; Was multiplicand negative?
	bgeq    550$                            ; No, skip.
	addl2   r2,4(r4)                        ; Yes, compensate.
550$:   addl2   r0,(r4)+                        ; Accumulate.
	adwc    r1,(r4)+
	movl    (r4),r1                         ; R1 = Next hi-end partial sum.
	adwc    r7,r1                           ; Add carry if needed.
	clrl    r7                              ; Reset lookahead register.
	adwc    #0,r7                           ; Save lookahead carry.
	movl    -(r4),r0                        ; R0 = Next lo-end partial.
	sobgtr  r5,500$                         ; More units?
999$:   sobgtr  r10,200$                        ; Nope, go get next multiplier
	ret
;
; This version of the above multiply loop is entered when the multiplier is
; negative - and we must compensate by adding the multiplicand to the hi-order
; product. This saves a test and a conditional branch per unit of precision.
;
	.align  quad,1                          ; Align with NOPs.
1500$:
	movl    (r3)+,r6                        ; R6 = Current multplicand.
	emul    r2,r6,#0,(r4)                   ; Multiply (64-bit result).
;
; Because we have removed leading zeroes, multiplication by zero is very
; unlikely, 1 in 2^32 or so. It is therefore easier to perform the test after
; the EMUL (looking at the zero product) that the multiplicand was zero so we 
; don't need any special case logic later to adjust the product pointer.
;
	beql    1560$                           ; If result eq, skip.
	tstl    r6                              ; Was multiplicand negative?
	bgeq    1550$                           ; No, skip.
	addl2   r2,4(r4)                        ; Yes, compensate.
1550$:
; As documented above, we unfolded the following to save instructions
;       tstl    r2                              ; Multiplier negative?
;       bgeq    1560$                           ; No, skip.
	addl2   r6,4(r4)                        ; Yes, compensate.
1560$:  addl2   r0,(r4)+                        ; Accumulate.
	adwc    r1,(r4)+                        ; R1 = High-end partial sum.
	movl    (r4),r1                         ; R1 = Next hi-end partial sum.
	adwc    r7,r1                           ; Add carry if needed.
	clrl    r7                              ; Reset lookahead register.
	adwc    #0,r7                           ; Save lookahead carry.
	movl    -(r4),r0                        ; R0 = Next lo-end partial.
	sobgtr  r5,1500$                        ; More units?
	sobgtr  r10,200$                        ; Nope, go get next multiplier
	ret
	.sbttl  P_SETP          Set Precison.
;+
; **-P_SETP-Set Precision
;
; Functional Description:
;
; This procedure is invoked to set the operating precision of the package.
;
; Calling Sequence:
;
;       P_SETP  nbits
;
; Parameters:
;
;       nbits           rw*v    Number of bits in number.
;
; Implicit Outputs:
;
;       Precis          Set to the number of longwords required to implement
;                       the requested precision.
;       Addoff          This is used as an offset into the various tables
;                       of adds, subtracts and rotates to implement the
;                       operation to the requested precsion.
;
; Status Returns:
;
;       None.
;
; Side Effects:
;
;       If the maximum precision set in 32-bit units by the assembly
;       parameter "max_unit_prec" is exceeded, a message to that effect will
;       be displayed and the program will terminate with a fatal error.
;-
	.entry  p_setp,^m<>
	movzwl  4(ap),r1                        ; R1 = No. of bits.
	addl2   #31,r1                          ; Round up to next long word.
	ashl    #-5,r1,r1                       ; R1 = No. of 32 bit words.
	movl    r1,precis                       ; Save precision.
.if not_defined novector
	subl3   r1,#max_unit_prec,r0            ; R0 = Number of steps reqd.
	blss    10$                             ; If > 0 then exit.
	mull3   #addsiz,r0,addoff               ; Get add table offset.
.iftf ; novector
	ret
.ift ; novector
10$:                                            ; Table size exceeded!
	movab   -80(sp),sp                      ; Output buffer.
	pushab  (sp)                            ; Build descriptor
	movzwl  #80,-(sp)
	clrl    -(sp)                           ; Receive return length.
	pushl   #max_unit_prec                  ; Compiled max table size.
	pushl   r1                              ; Requested table size.
	pushaq  8+4(sp)                         ; -> Output buffer descriptor.
	pushaw  12(sp)                          ; -> Returned length.
	pushaq  prectoobig                      ; -> FAO control string.
	calls   #5,g^sys$fao                    ; Format output string.
	movl    (sp)+,(sp)                      ; Set actual buffer size.
	pushaq  (sp)                            ; -> Output buffer descr.
	calls   #1,g^lib$put_output             ; Output message.
	$exit_s -                               ; Exit with severe error.
		code=#4
.endc ; novector
	.end
 |