; bignuma.asm

; based on:
; bbignuma.asm - asm routines for bignumbers
; Wesley Loewer's Big Numbers.        (C) 1994-95, Wesley B. Loewer
; based pointer version

; See BIGLIB.TXT for further documentation.

; general programming notes for the based pointer version
; ALL big_t pointers must have a segment value equal to bignum_seg.
; single arg procedures, p(r), r = bx (or si when required)
; two arg procedures,    p(r,n), r=di, n=bx(or si when required)
; two arg procedures,    p(n1,n2), n1=bx(or si when required), n2=di
; three arg proc,        p(r,n1,n2), r=di, n1=si, n2=bx
; unless otherwise noted, such as full_mult, mult, full_square, square

.MODEL medium, c

include big.inc
include bigport.inc

.DATA

.CODE
.8086

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = 0
clear_bn   PROC USES di, r:bn_t
; Zero-fill the bnlength bytes of the bignumber at bignum_seg:r.
; Returns the offset r in ax.
; NOTE(review): the string stores below presumably rely on the direction
; flag being clear on entry - confirm against the calling convention.

        mov     di, word ptr r
        mov     es, bignum_seg          ; es:di -> first byte of r
        mov     cx, bnlength            ; cx = size of r in bytes

IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short fill_dwords       ; yes, store four bytes per step
ENDIF

IFDEF BIG16
        xor     ax, ax                  ; fill pattern = 0
        shr     cx, 1                   ; bytes -> words
        rep     stosw                   ; zero r one word per step
ENDIF

IFDEF BIG16AND32
        jmp     done
ENDIF

IFDEF BIG32
fill_dwords:
.386
        xor     eax, eax                ; fill pattern = 0
        shr     cx, 2                   ; bytes -> dwords
        rep     stosd                   ; zero r one dword per step
ENDIF

done:
.8086
        mov     ax, word ptr r          ; hand r back to the caller
        ret

clear_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = max positive value
max_bn   PROC USES di, r:bn_t
; Set the bignumber at bignum_seg:r to the largest positive value:
; every byte 0FFh except the most significant one, which becomes 7Fh.
; Returns the offset r in ax.

        mov     di, word ptr r
        mov     es, bignum_seg          ; es:di -> first byte of r
        mov     cx, bnlength            ; cx = size of r in bytes

IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short fill_dwords       ; yes, store four bytes per step
ENDIF

IFDEF BIG16
        mov     ax, -1                  ; fill pattern = all ones
        shr     cx, 1                   ; bytes -> words
        rep     stosw                   ; saturate r one word per step
ENDIF

IFDEF BIG16AND32
        jmp     fix_sign
ENDIF

IFDEF BIG32
fill_dwords:
.386
        mov     eax, -1                 ; fill pattern = all ones
        shr     cx, 2                   ; bytes -> dwords
        rep     stosd                   ; saturate r one dword per step
ENDIF

fix_sign:
.8086
        ; the fill leaves di one byte past the end of r, so step back onto
        ; the most significant byte and drop its sign bit
        dec     di
        mov     byte ptr es:[di], 7Fh

        mov     ax, word ptr r          ; hand r back to the caller
        ret

max_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n
copy_bn   PROC USES di si, r:bn_t, n:bn_t
; Copy the bnlength bytes of bignumber n to bignumber r; both offsets are
; taken relative to bignum_seg.  Returns the offset r in ax.

        push    ds                      ; ds is repointed for the movs below
        mov     si, word ptr n
        mov     di, word ptr r
        mov     cx, bnlength            ; cx = number of bytes to copy
        mov     es, bignum_seg          ; destination is es:di

IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short copy_dwords       ; yes, move four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; source is ds:si

        shr     cx, 1                   ; bytes -> words
        rep     movsw                   ; transfer one word per step
ENDIF

IFDEF BIG16AND32
        jmp     done
ENDIF

IFDEF BIG32
copy_dwords:
.386
        mov     ds, bignum_seg          ; source is ds:si

        shr     cx, 2                   ; bytes -> dwords
        rep     movsd                   ; transfer one dword per step
ENDIF

done:
.8086
        pop     ds                      ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret

copy_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; n1 != n2 ?
; RETURNS: if n1 == n2 returns 0
;          if n1 > n2 returns a positive (steps left to go when mismatch occurred)
;          if n1 < n2 returns a negative (steps left to go when mismatch occurred)
cmp_bn   PROC USES di, n1:bn_t, n2:bn_t
; Compare the bignumbers n1 and n2 (offsets into bignum_seg), starting at
; the most significant end and working down.
;
; Returns in ax:
;    0   n1 == n2
;   +k   n1 >  n2
;   -k   n1 <  n2
; where k is the number of bytes that were still left to examine,
; counting the mismatching byte itself.
;
; The most significant byte carries the sign, so a mismatch there is
; judged with a signed comparison; a mismatch in any lower byte is
; judged with an unsigned comparison.
;
; Every path that reaches "bottom" must arrive with the flags of a byte
; compare of the mismatching n1 byte against the matching n2 byte.

        push    ds                      ; save DS
        mov     cx, bnlength
        mov     dx, cx                  ; save bnlength for later comparison
        mov     di, word ptr n2         ; load n2 pointer in di
        mov     bx, word ptr n1         ; load n1 pointer in bx

        add     bx, cx                  ; point to end of bignumbers
        add     di, cx                  ; where the msb is

IFDEF BIG16AND32
        cmp     cpu, 386                ; check cpu
        jae     short use_32_bit        ; use faster 32 bit code if possible
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; load ds
        shr     cx, 1                   ; byte = 1/2 word
top_loop_16:
        sub     bx, 2                   ; decrement to previous word
        sub     di, 2
        mov     ax, ds:[bx]             ; load n1
        cmp     ax, ds:[di]             ; compare to n2
        jne     not_match_16            ; don't match
        loop    top_loop_16
        jmp     match                   ; cx is zero
not_match_16:
        ; now determine which byte of the two did not match
        shl     cx, 1                   ; convert back to bytes
        cmp     ah, ds:[di+1]           ; compare to n2
        jne     bottom                  ; jump if ah doesn't match
        ; if ah does match, then mismatch was in al
        dec     cx                      ; decrement cx by 1 to show match
        ; dec has just overwritten ZF/SF and the last cmp compared equal
        ; bytes, so redo the compare on the byte that really differs
        cmp     al, ds:[di]             ; reset the flags for below
        jmp     bottom

ENDIF

IFDEF BIG32
use_32_bit:
.386
        mov     ds, bignum_seg          ; load ds
        shr     cx, 2                   ; byte = 1/4 dword
top_loop_32:
        sub     bx, 4                   ; decrement to previous dword
        sub     di, 4
        mov     eax, ds:[bx]            ; load n1
        cmp     eax, ds:[di]            ; compare to n2
        jne     not_match_32            ; don't match
        loop    top_loop_32
        jmp     match                   ; cx is zero
not_match_32:
        ; now determine which byte of the four did not match
        shl     cx, 2                   ; convert back to bytes
        mov     ebx, eax
        shr     ebx, 16                 ; shift ebx_high to bx
        cmp     bh, ds:[di+3]           ; compare to n2
        jne     bottom                  ; jump if bh doesn't match
        dec     cx                      ; decrement cx by 1 to show match
        cmp     bl, ds:[di+2]           ; compare to n2
        jne     bottom                  ; jump if bl doesn't match
        dec     cx                      ; decrement cx by 1 to show match
        cmp     ah, ds:[di+1]           ; compare to n2
        jne     bottom                  ; jump if ah doesn't match
        ; if bh,bl,ah do match, then mismatch was in al
        dec     cx                      ; decrement cx by 1 to show match
        ; same as the 16 bit path: the flags must describe the al compare,
        ; not the dec above
        cmp     al, ds:[di]             ; reset the flags for below
        jmp     bottom

ENDIF

bottom:
.8086
; flags are still set from the last byte cmp
; if cx == dx, then most significant part didn't match, use signed comparison
; else the decimals didn't match, use unsigned comparison
;
; The flags are parked with pushf/popf rather than lahf/sahf: lahf/sahf do
; not transfer the overflow flag, and the signed jg below needs OF from the
; byte compare (e.g. 7Fh against 80h) to give the right answer.
        pushf                           ; keep results of last cmp, OF included
        cmp     cx, dx                  ; did they differ on very first cmp
        jne     not_first_step          ; no

        popf                            ; yes
        jg      n1_bigger               ; signed comparison
        jmp     n2_bigger

not_first_step:
        popf
        ja      n1_bigger               ; unsigned comparison

n2_bigger:
        neg     cx                      ; make it negative
n1_bigger:                              ; leave it positive
match:                                  ; leave it zero
        mov     ax, cx
        pop     ds                      ; restore DS
        ret

cmp_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; n < 0 ?
; returns 1 if negative, 0 if positive or zero
is_bn_neg   PROC n:bn_t
; Report the sign of the bignumber at bignum_seg:n.
; Returns ax = 1 when the top bit of its most significant byte is set,
; otherwise ax = 0.

        ; a single byte is read, so es is used and ds is left untouched
        mov     bx, word ptr n
        add     bx, bnlength                ; bx -> one byte past the end of n
        mov     es, bignum_seg
        mov     al, es:[bx-1]               ; most significant byte

        rol     al, 1                       ; bring the sign bit (bit 7) to bit 0
        and     ax, 1                       ; keep just that bit; also zeroes ah
        ret

is_bn_neg   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; n != 0 ?
; RETURNS: if n != 0 returns nonzero (the loop count left in cx when the
;          first nonzero word/dword was found)
;          else returns 0
is_bn_not_zero   PROC n:bn_t
; Scan the bignumber at bignum_seg:n from its first unit upward, looking
; for a unit that is not zero.
; Returns ax = 0 when every unit is zero; otherwise ax holds the loop count
; that was still outstanding when the first nonzero unit was met.

        push    ds                      ; ds is repointed at bignum_seg below
        mov     bx, word ptr n
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short scan_dwords       ; yes, test four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:bx -> n
        shr     cx, 1                   ; bytes -> words
scan_word:
        cmp     word ptr ds:[bx], 0     ; is this word zero?
        jne     done                    ; no: leave with cx still nonzero
        inc     bx                      ; step to the following word
        inc     bx
        loop    scan_word
ENDIF

IFDEF BIG16AND32
        jmp     done
ENDIF

IFDEF BIG32
scan_dwords:
.386
        mov     ds, bignum_seg          ; ds:bx -> n
        shr     cx, 2                   ; bytes -> dwords
scan_dword:
        cmp     dword ptr ds:[bx], 0    ; is this dword zero?
        jne     done                    ; no: leave with cx still nonzero
        add     bx, 4                   ; step to the following dword
        loop    scan_dword
ENDIF

done:
.8086
        pop     ds                      ; back to the caller's data segment
        ; cx ran down to zero only if no nonzero unit was found
        mov     ax, cx
        ret

is_bn_not_zero   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n1 + n2
add_bn   PROC USES di si, r:bn_t, n1:bn_t, n2:bn_t
; Store n1 + n2 in r (all three are offsets into bignum_seg), carrying from
; the lowest unit upward.  Nothing is done with the carry out of the
; highest unit.  Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     bx, WORD PTR n2
        mov     si, WORD PTR n1
        mov     di, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below


IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short add_dwords        ; yes, add four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; all operands live in this segment

        shr     cx, 1                   ; bytes -> words
        clc                             ; nothing carried into the lowest word

add_word:
        mov     ax, ds:[si]             ; word of n1
        adc     ax, ds:[bx]             ; plus word of n2 plus carry
        mov     ds:[di], ax             ; becomes word of r

        ; lea moves the pointers on without disturbing the carry flag
        lea     si, [si+2]
        lea     bx, [bx+2]
        lea     di, [di+2]

        loop    add_word

ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
add_dwords:
.386
        mov     ds, bignum_seg          ; all operands live in this segment

        shr     cx, 2                   ; bytes -> dwords
        clc                             ; nothing carried into the lowest dword

add_dword:
        mov     eax, ds:[si]            ; dword of n1
        adc     eax, ds:[bx]            ; plus dword of n2 plus carry
        mov     ds:[di], eax            ; becomes dword of r

        ; lea moves the pointers on without disturbing the carry flag
        lea     si, [si+4]
        lea     bx, [bx+4]
        lea     di, [di+4]

        loop    add_dword
ENDIF

done:
.8086
        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
add_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r += n
add_a_bn   PROC USES di, r:bn_t, n:bn_t
; Add n to r in place (both are offsets into bignum_seg), carrying from the
; lowest unit upward.  Nothing is done with the carry out of the highest
; unit.  Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     bx, WORD PTR n
        mov     di, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short add_dwords        ; yes, add four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 1                   ; bytes -> words
        clc                             ; nothing carried into the lowest word

add_word:
        mov     ax, ds:[bx]             ; word of n
        adc     ds:[di], ax             ; accumulate into r with carry

        ; lea moves the pointers on without disturbing the carry flag
        lea     bx, [bx+2]
        lea     di, [di+2]

        loop    add_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
add_dwords:
.386
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 2                   ; bytes -> dwords
        clc                             ; nothing carried into the lowest dword

add_dword:
        mov     eax, ds:[bx]            ; dword of n
        adc     ds:[di], eax            ; accumulate into r with carry

        ; lea moves the pointers on without disturbing the carry flag
        lea     bx, [bx+4]
        lea     di, [di+4]

        loop    add_dword
ENDIF

done:
.8086
        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
add_a_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n1 - n2
sub_bn   PROC USES di si, r:bn_t, n1:bn_t, n2:bn_t
; Store n1 - n2 in r (all three are offsets into bignum_seg), borrowing
; from the lowest unit upward.  Nothing is done with the borrow out of the
; highest unit.  Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     bx, WORD PTR n2
        mov     si, WORD PTR n1
        mov     di, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below


IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short sub_dwords        ; yes, subtract four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; all operands live in this segment

        shr     cx, 1                   ; bytes -> words
        clc                             ; no borrow into the lowest word

sub_word:
        mov     ax, ds:[si]             ; word of n1
        sbb     ax, ds:[bx]             ; minus word of n2 minus borrow
        mov     ds:[di], ax             ; becomes word of r

        ; lea moves the pointers on without disturbing the carry flag
        lea     si, [si+2]
        lea     bx, [bx+2]
        lea     di, [di+2]

        loop    sub_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
sub_dwords:
.386
        mov     ds, bignum_seg          ; all operands live in this segment

        shr     cx, 2                   ; bytes -> dwords
        clc                             ; no borrow into the lowest dword

sub_dword:
        mov     eax, ds:[si]            ; dword of n1
        sbb     eax, ds:[bx]            ; minus dword of n2 minus borrow
        mov     ds:[di], eax            ; becomes dword of r

        ; lea moves the pointers on without disturbing the carry flag
        lea     si, [si+4]
        lea     bx, [bx+4]
        lea     di, [di+4]

        loop    sub_dword
ENDIF

done:
.8086

        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
sub_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r -= n
sub_a_bn   PROC USES di, r:bn_t, n:bn_t
; Subtract n from r in place (both are offsets into bignum_seg), borrowing
; from the lowest unit upward.  Nothing is done with the borrow out of the
; highest unit.  Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     bx, WORD PTR n
        mov     di, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386                ; 386 or better available?
        jae     short sub_dwords        ; yes, subtract four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 1                   ; bytes -> words
        clc                             ; no borrow into the lowest word

sub_word:
        mov     ax, ds:[bx]             ; word of n
        sbb     ds:[di], ax             ; take it from r with borrow

        ; lea moves the pointers on without disturbing the carry flag
        lea     bx, [bx+2]
        lea     di, [di+2]

        loop    sub_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
sub_dwords:
.386
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 2                   ; bytes -> dwords
        clc                             ; no borrow into the lowest dword

sub_dword:
        mov     eax, ds:[bx]            ; dword of n
        sbb     ds:[di], eax            ; take it from r with borrow

        ; lea moves the pointers on without disturbing the carry flag
        lea     bx, [bx+4]
        lea     di, [di+4]

        loop    sub_dword

ENDIF

done:
.8086

        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
sub_a_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = -n
neg_bn   PROC USES di, r:bn_t, n:bn_t
; Store the two's complement of n in r (both are offsets into bignum_seg).
; Units are negated from the lowest one upward until the first nonzero
; unit has been negated; every unit above that one is simply inverted.
; Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     bx, WORD PTR n
        mov     di, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386
        jae     short negate_dwords     ; 386 or better: four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 1                   ; bytes -> words

negate_word:
        mov     ax, ds:[bx]
        neg     ax                      ; sets carry exactly when the word was nonzero
        mov     ds:[di], ax

        lea     di, [di+2]              ; lea leaves the carry from neg intact
        lea     bx, [bx+2]
        jc      short invert_rest_16    ; nonzero word done: switch to inverting

        loop    negate_word
        jmp     short done

invert_rest_16:
        loop    invert_word             ; any words left above this one?
        jmp     short done

invert_word:
        mov     ax, ds:[bx]
        not     ax
        mov     ds:[di], ax

        add     di, 2
        add     bx, 2

        loop    invert_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
negate_dwords:
.386
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 2                   ; bytes -> dwords

negate_dword:
        mov     eax, ds:[bx]
        neg     eax                     ; sets carry exactly when the dword was nonzero
        mov     ds:[di], eax

        lea     di, [di+4]              ; lea leaves the carry from neg intact
        lea     bx, [bx+4]
        jc      short invert_rest_32    ; nonzero dword done: switch to inverting

        loop    negate_dword
        jmp     short done

invert_rest_32:
        loop    invert_dword            ; any dwords left above this one?
        jmp     short done

invert_dword:
        mov     eax, ds:[bx]
        not     eax
        mov     ds:[di], eax

        add     di, 4
        add     bx, 4

        loop    invert_dword
ENDIF

done:
.8086

        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
neg_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r *= -1
neg_a_bn   PROC r:bn_t
; Replace the bignumber at bignum_seg:r with its two's complement.
; Units are negated from the lowest one upward until the first nonzero
; unit has been negated; every unit above that one is simply inverted.
; Returns the offset r in ax.

        mov     ax, ds                  ; ax keeps the caller's ds
        mov     bx, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386
        jae     short negate_dwords     ; 386 or better: four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:bx -> r
        shr     cx, 1                   ; bytes -> words

negate_word:
        neg     word ptr ds:[bx]        ; sets carry exactly when the word was nonzero
        lea     bx, [bx+2]              ; lea leaves the carry from neg intact
        jc      short invert_rest_16    ; nonzero word done: switch to inverting

        loop    negate_word
        jmp     short done

invert_rest_16:
        loop    invert_word             ; any words left above this one?
        jmp     short done

invert_word:
        not     word ptr ds:[bx]

        add     bx, 2

        loop    invert_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
negate_dwords:
.386
        mov     ds, bignum_seg          ; ds:bx -> r
        shr     cx, 2                   ; bytes -> dwords

negate_dword:
        neg     dword ptr ds:[bx]       ; sets carry exactly when the dword was nonzero
        lea     bx, [bx+4]              ; lea leaves the carry from neg intact
        jc      short invert_rest_32    ; nonzero dword done: switch to inverting

        loop    negate_dword
        jmp     short done

invert_rest_32:
        loop    invert_dword            ; any dwords left above this one?
        jmp     short done

invert_dword:
        not     dword ptr ds:[bx]

        add     bx, 4

        loop    invert_dword
ENDIF

done:
.8086
        mov     ds, ax                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
neg_a_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = 2*n
double_bn   PROC USES di, r:bn_t, n:bn_t
; Store n shifted left by one bit in r (both are offsets into bignum_seg).
; The bit leaving each unit is fed into the next higher unit through the
; carry flag; the bit leaving the highest unit is not kept.
; Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     bx, WORD PTR n
        mov     di, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386
        jae     short shift_dwords      ; 386 or better: four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 1                   ; bytes -> words
        clc                             ; a zero enters the lowest bit

shift_word:
        mov     ax, ds:[bx]
        rcl     ax, 1                   ; carry in at bit 0, bit 15 out to carry
        mov     ds:[di], ax

        ; lea moves the pointers on without disturbing the carry flag
        lea     bx, [bx+2]
        lea     di, [di+2]

        loop    shift_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
shift_dwords:
.386
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 2                   ; bytes -> dwords
        clc                             ; a zero enters the lowest bit

shift_dword:
        mov     eax, ds:[bx]
        rcl     eax, 1                  ; carry in at bit 0, bit 31 out to carry
        mov     ds:[di], eax

        ; lea moves the pointers on without disturbing the carry flag
        lea     bx, [bx+4]
        lea     di, [di+4]

        loop    shift_dword

ENDIF
done:
.8086

        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
double_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r *= 2
double_a_bn   PROC r:bn_t
; Shift the bignumber at bignum_seg:r left by one bit, in place.
; The bit leaving each unit is fed into the next higher unit through the
; carry flag; the bit leaving the highest unit is not kept.
; Returns the offset r in ax.

        mov     ax, ds                  ; ax keeps the caller's ds
        mov     bx, WORD PTR r
        mov     cx, bnlength            ; byte count, scaled per path below

IFDEF BIG16AND32
        cmp     cpu, 386
        jae     short shift_dwords      ; 386 or better: four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:bx -> r

        shr     cx, 1                   ; bytes -> words
        clc                             ; a zero enters the lowest bit

shift_word:
        rcl     word ptr ds:[bx], 1     ; carry in at bit 0, bit 15 out to carry

        lea     bx, [bx+2]              ; next word; lea keeps the carry flag

        loop    shift_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
shift_dwords:
.386
        mov     ds, bignum_seg          ; ds:bx -> r

        shr     cx, 2                   ; bytes -> dwords
        clc                             ; a zero enters the lowest bit

shift_dword:
        rcl     dword ptr ds:[bx], 1    ; carry in at bit 0, bit 31 out to carry

        lea     bx, [bx+4]              ; next dword; lea keeps the carry flag

        loop    shift_dword
ENDIF

done:
.8086

        mov     ds, ax                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
double_a_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n/2
half_bn   PROC USES di, r:bn_t, n:bn_t
; Store n shifted right by one bit in r (both are offsets into bignum_seg).
; Work runs from the most significant unit downward: the top unit is
; shifted arithmetically so the sign bit is kept, and each lower unit takes
; the bit dropped by the unit above it through the carry flag.
; Returns the offset r in ax.

        mov     dx, ds                  ; dx keeps the caller's ds
        mov     cx, bnlength
        mov     bx, WORD PTR n
        mov     di, WORD PTR r

        add     bx, cx                  ; both pointers now sit just past
        add     di, cx                  ; the most significant byte

IFDEF BIG16AND32
        cmp     cpu, 386
        jae     short shift_dwords      ; 386 or better: four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 1                   ; bytes -> words

        ; top word: sar keeps the sign, all lower words use rcr
        sub     bx, 2
        sub     di, 2

        mov     ax, ds:[bx]
        sar     ax, 1                   ; bit 0 drops into the carry flag
        mov     ds:[di], ax

        loop    shift_word              ; continue only if words remain
        jmp     short done


shift_word:
        ; lea steps the pointers down without disturbing the carry flag
        lea     bx, [bx-2]
        lea     di, [di-2]

        mov     ax, ds:[bx]
        rcr     ax, 1                   ; carry in at bit 15, bit 0 out to carry
        mov     ds:[di], ax

        loop    shift_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
shift_dwords:
.386
        mov     ds, bignum_seg          ; both operands live in this segment

        shr     cx, 2                   ; bytes -> dwords

        ; top dword: sar keeps the sign, all lower dwords use rcr
        sub     bx, 4
        sub     di, 4

        mov     eax, ds:[bx]
        sar     eax, 1                  ; bit 0 drops into the carry flag
        mov     ds:[di], eax

        loop    shift_dword             ; continue only if dwords remain
        jmp     short done

shift_dword:
        ; lea steps the pointers down without disturbing the carry flag
        lea     bx, [bx-4]
        lea     di, [di-4]

        mov     eax, ds:[bx]
        rcr     eax, 1                  ; carry in at bit 31, bit 0 out to carry
        mov     ds:[di], eax

        loop    shift_dword
ENDIF

done:
.8086

        mov     ds, dx                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
half_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r /= 2
half_a_bn   PROC r:bn_t
; Shift the bignumber at bignum_seg:r right by one bit, in place.
; Work runs from the most significant unit downward: the top unit is
; shifted arithmetically so the sign bit is kept, and each lower unit takes
; the bit dropped by the unit above it through the carry flag.
; Returns the offset r in ax.

        mov     ax, ds                  ; ax keeps the caller's ds
        mov     cx, bnlength
        mov     bx, WORD PTR r

        add     bx, cx                  ; bx now sits just past the top byte


IFDEF BIG16AND32
        cmp     cpu, 386
        jae     short shift_dwords      ; 386 or better: four bytes per step
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:bx -> end of r

        shr     cx, 1                   ; bytes -> words

        ; top word: sar keeps the sign, all lower words use rcr
        sub     bx, 2

        sar     word ptr ds:[bx], 1     ; bit 0 drops into the carry flag

        loop    shift_word              ; continue only if words remain
        jmp     short done


shift_word:
        lea     bx, [bx-2]              ; previous word; lea keeps the carry flag

        rcr     word ptr ds:[bx], 1     ; carry in at bit 15, bit 0 out to carry

        loop    shift_word
ENDIF

IFDEF BIG16AND32
        jmp     short done
ENDIF

IFDEF BIG32
shift_dwords:
.386
        mov     ds, bignum_seg          ; ds:bx -> end of r

        shr     cx, 2                   ; bytes -> dwords
        sub     bx, 4                   ; top dword
        sar     dword ptr ds:[bx], 1    ; sign kept, bit 0 drops into carry

        loop    shift_dword             ; continue only if dwords remain
        jmp     short done

shift_dword:
        lea     bx, [bx-4]              ; previous dword; lea keeps the carry flag

        rcr     dword ptr ds:[bx], 1    ; carry in at bit 31, bit 0 out to carry

        loop    shift_dword
ENDIF

done:
.8086

        mov     ds, ax                  ; back to the caller's data segment
        mov     ax, word ptr r          ; hand r back to the caller
        ret
half_a_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n1 * n2
; Note: r will be a double wide result, 2*bnlength
;       n1 and n2 can be the same pointer
; SIDE-EFFECTS: n1 and n2 are changed to their absolute values
;
unsafe_full_mult_bn   PROC USES di si, r:bn_t, n1:bn_t, n2:bn_t
LOCAL sign1:byte, sign2:byte, samevar:byte, \
      i:word, j:word, steps:word, doublesteps:word, carry_steps:word, \
      n1p: near ptr byte, n2p: near ptr byte

        push    ds                          ; save ds
        mov     es, bignum_seg              ; load es for when ds is a pain

; Test to see if n1 and n2 are the same variable.  It would be better to
; use square_bn(), but it could happen.

        mov     samevar, 0                  ; assume they are not the same
        mov     bx, word ptr n1
        cmp     bx, word ptr n2             ; compare offset
        jne     end_samevar_check           ; not the same
        mov     samevar, 1                  ; they are the same
end_samevar_check:

; By forcing the bignumber to be positive and keeping track of the sign
; bits separately, quite a few multiplies are saved.

                                            ; check for sign bits
        add     bx, bnlength
        mov     al, es:[bx-1]
        and     al, 80h                     ; check the sign bit
        mov     sign1, al
        jz      already_pos1
        invoke  neg_a_bn, n1
already_pos1:

        cmp     samevar, 1                  ; if it's the same variable
        je      already_pos2                ; then skip this second check
        mov     bx, word ptr n2
        add     bx, bnlength
        mov     al, es:[bx-1]
        and     al, 80h                     ; check the sign bit
        mov     sign2, al
        jz      already_pos2
        invoke  neg_a_bn, n2
already_pos2:

; in the following loops, the following pointers are used
;   n1p, n2p = points to the part of n1, n2 being used
;   di = points to part of doublebignumber r used in outer loop
;   si = points to part of doublebignumber r used in inner loop
;   bx = points to part of doublebignumber r for carry flag loop
; Also, since r is used more than n1p or n2p, abandon the convention of
; using ES for r.  Using DS will save a few clock cycles.

IFDEF BIG16AND32
        cmp     cpu, 386                ; check cpu
;        jae     use_32_bit              ; use faster 32 bit code if possible
        jb      wont_use_32bit
        jmp     use_32_bit              ; use faster 32 bit code if possible
wont_use_32bit:
ENDIF

IFDEF BIG16
        ; set variables
        mov     dx, bnlength            ; set outer loop counter
        shr     dx, 1                   ; byte = 1/2 word
        mov     steps, dx               ; save in steps
        mov     i, dx
        shl     dx, 1                   ; double steps

        ; clear r
        sub     ax, ax                  ; clear ax
        mov     cx, dx                  ; size of doublebignumber (r) in words
        mov     di, word ptr r          ; load r in es:di for stos
        rep     stosw                   ; initialize r to 0

        sub     dx, 2                   ; only 2*s-2 steps are really needed
        mov     doublesteps, dx
        mov     carry_steps, dx

        ; prepare segments and offsets for loops
        mov     di, word ptr r
        mov     si, di                  ; both si and di are used here
        mov     ds, bignum_seg          ; load ds
        mov     ax, word ptr n1         ; load pointers
        mov     n1p, ax
        ; use ds for all pointers

top_outer_loop_16:
        mov     ax, word ptr n2         ; set n2p pointer
        mov     n2p, ax
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_16:
        mov     bx, n1p
        mov     ax, ds:[bx]
        mov     bx, n2p
        mul     word ptr ds:[bx]

        mov     bx, si
        add     bx, 2                   ; increase by size of word
        add     ds:[bx-2], ax           ; add low word
        adc     ds:[bx], dx             ; add high word
        jnc     no_more_carry_16        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_16
        add     bx, 2                   ; move pointer to next word

        ; loop until no more carry or until end of double big number
top_carry_loop_16:
        add     word ptr ds:[bx], 1     ; use add, not inc
        jnc     no_more_carry_16
        add     bx, 2                   ; increase by size of word
        loop    top_carry_loop_16

no_more_carry_16:
        add     n2p, 2                  ; increase by word size
        add     si, 2
        dec     carry_steps             ; use one less step
        dec     j
        ja      top_inner_loop_16

        add     n1p, 2                  ; increase by word size
        add     di, 2
        mov     si, di                  ; start with si=di

        dec     doublesteps             ; reduce the carry steps needed
        mov     ax, doublesteps
        mov     carry_steps, ax


        dec     i
        ja      top_outer_loop_16

        ; result is now r, a double wide bignumber
ENDIF

IFDEF BIG16AND32
        jmp     bottom
ENDIF

IFDEF BIG32
use_32_bit:
.386
        ; set variables
        mov     dx, bnlength            ; set outer loop counter
        shr     dx, 2                   ; byte = 1/4 dword
        mov     steps, dx               ; save in steps
        mov     i, dx
        shl     dx, 1                   ; double steps

        ; clear r
        sub     eax, eax                ; clear eax
        mov     cx, dx                  ; size of doublebignumber in dwords
        mov     di, word ptr r          ; load r in es:di for stos
        rep     stosd                   ; initialize r to 0

        sub     dx, 2                   ; only 2*s-2 steps are really needed
        mov     doublesteps, dx
        mov     carry_steps, dx

        ; prepare segments and offsets for loops
        mov     di, word ptr r
        mov     si, di                  ; both si and di are used here
        mov     ds, bignum_seg          ; load ds
        mov     ax, word ptr n1         ; load pointers
        mov     n1p, ax

top_outer_loop_32:
        mov     ax, word ptr n2         ; set n2p pointer
        mov     n2p, ax
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_32:
        mov     bx, n1p
        mov     eax, ds:[bx]
        mov     bx, n2p
        mul     dword ptr ds:[bx]

        mov     bx, si
        add     bx, 4                   ; increase by size of dword
        add     ds:[bx-4], eax          ; add low dword
        adc     ds:[bx], edx            ; add high dword
        jnc     no_more_carry_32        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_32
        add     bx, 4                   ; move pointer to next dword

        ; loop until no more carry or until end of double big number
top_carry_loop_32:
        add     dword ptr ds:[bx], 1    ; use add, not inc
        jnc     no_more_carry_32
        add     bx, 4                   ; increase by size of dword
        loop    top_carry_loop_32

no_more_carry_32:
        add     n2p, 4                  ; increase by dword size
        add     si, 4
        dec     carry_steps             ; use one less step
        dec     j
        ja      top_inner_loop_32

        add     n1p, 4                  ; increase by dword size
        add     di, 4
        mov     si, di                  ; start with si=di

        dec     doublesteps             ; reduce the carry steps needed
        mov     ax, doublesteps
        mov     carry_steps, ax


        dec     i
        ja      top_outer_loop_32

        ; result is now r, a double wide bignumber
ENDIF

bottom:
.8086

        pop     ds                      ; restore ds
        cmp     samevar, 1              ; were the variable the same ones?
        je      pos_answer              ; if yes, then jump

        mov     al, sign1               ; is result + or - ?
        cmp     al, sign2               ; sign(n1) == sign(n2) ?
        je      pos_answer              ; yes
        shl     bnlength, 1             ; temporarily double bnlength
                                        ; for double wide bignumber
        invoke  neg_a_bn, r             ; does not affect ES
        shr     bnlength, 1             ; restore bnlength
pos_answer:

        mov     ax, word ptr r          ; return r in ax
        ret
unsafe_full_mult_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n1 * n2 calculating only the top rlength bytes
; Note: r will be of length rlength
;       2*bnlength >= rlength > bnlength
;       n1 and n2 can be the same pointer
; SIDE-EFFECTS: n1 and n2 are changed to their absolute values
;
; Returns: ax = offset of r
;
; Outline:
;   1. record the sign bit of n1 and n2 and force both to be positive
;   2. clear r, then accumulate the word (dword) partial products into it
;   3. if the recorded signs differ, negate r
; When n1 and n2 are the same variable the sign bookkeeping for n2 is
; skipped and the result is left positive.
;
unsafe_mult_bn   PROC USES di si, r:bn_t, n1:bn_t, n2:bn_t
LOCAL sign1:byte, sign2:byte, samevar:byte, \
      i:word, j:word, steps:word, doublesteps:word, \
      carry_steps:word, skips:word, \
      n1p: ptr byte, n2p: ptr byte

        push    ds                          ; save ds
        mov     es, bignum_seg              ; load es for when ds is a pain

; Test to see if n1 and n2 are the same variable.  It would be better to
; use square_bn(), but it could happen.
; Only the offsets are compared.

        mov     samevar, 0                  ; assume they are not the same
        mov     bx, word ptr n1
        cmp     bx, word ptr n2             ; compare offset
        jne     end_samevar_check           ; not the same
        mov     samevar, 1                  ; they are the same
end_samevar_check:

; By forcing the bignumber to be positive and keeping track of the sign
; bits separately, quite a few multiplies are saved.

                                            ; check for sign bits
        add     bx, bnlength                ; bx = one past the last byte of n1
        mov     al, es:[bx-1]               ; most significant byte of n1
        and     al, 80h                     ; check the sign bit
        mov     sign1, al
        jz      already_pos1
        invoke  neg_a_bn, n1
already_pos1:

        cmp     samevar, 1                  ; if it's the same variable
        je      already_pos2                ; then skip this second check
                                            ; (sign2 stays unset; it is not
                                            ; read when samevar is 1)
        mov     bx, word ptr n2
        add     bx, bnlength                ; bx = one past the last byte of n2
        mov     al, es:[bx-1]               ; most significant byte of n2
        and     al, 80h                     ; check the sign bit
        mov     sign2, al
        jz      already_pos2
        invoke  neg_a_bn, n2
already_pos2:

        ; adjust n2 pointer for partial precision
        ; (this changes only this procedure's copy of the n2 argument)
        mov     ax, bnlength
        shl     ax, 1                   ; 2*bnlength
        sub     ax, rlength             ; 2*bnlength-rlength
        add     word ptr n2, ax         ; n2 = n2+2*bnlength-rlength


; in the following loops, the following pointers are used
;   n1p, n2p = points to the part of n1, n2 being used
;   di = points to part of doublebignumber used in outer loop
;   si = points to part of doublebignumber used in inner loop
;   bx = points to part of doublebignumber for carry flag loop
; Also, since r is used more than n1p or n2p, abandon the convention of
; using ES for r.  Using DS will save a few clock cycles.

IFDEF BIG16AND32
        cmp     cpu, 386                ; check cpu
;        jae     use_32_bit              ; use faster 32 bit code if possible
        jb      cant_use_32bit
        jmp     use_32_bit              ; use faster 32 bit code if possible
cant_use_32bit:
ENDIF

IFDEF BIG16
        ; clear r
        sub     ax, ax                  ; clear ax
        mov     cx, rlength             ; size of r in bytes
        shr     cx, 1                   ; byte = 1/2 word
        mov     di, word ptr r          ; load r in es:di for stos
        rep     stosw                   ; initialize r to 0

        ; set variables
        mov     ax, rlength             ; set steps for first loop
        sub     ax, bnlength
        shr     ax, 1                   ; byte = 1/2 word
        mov     steps, ax               ; save in steps

        mov     ax, bnlength
        shr     ax, 1                   ; byte = 1/2 word
        mov     i, ax                   ; outer loop counter = words in n1

        sub     ax, steps
        mov     skips, ax               ; how long to skip over pointer shifts

        mov     ax, rlength             ; number of carry steps available in r
        shr     ax, 1                   ; byte = 1/2 word
        sub     ax, 2                   ; only rlength/2-2 steps are really needed
        mov     doublesteps, ax
        mov     carry_steps, ax

        ; prepare segments and offsets for loops
        mov     di, word ptr r
        mov     si, di                  ; both si and di are used here
        mov     ds, bignum_seg          ; load ds
        mov     ax, word ptr n1         ; load pointers
        mov     n1p, ax
        ; use ds for all pointers
        ; (locals and arguments are still reachable, they are on the stack)


top_outer_loop_16:
        mov     ax, word ptr n2         ; set n2p pointer
        mov     n2p, ax
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_16:
        mov     bx, n1p
        mov     ax, ds:[bx]
        mov     bx, n2p
        mul     word ptr ds:[bx]        ; dx:ax = *n1p * *n2p

        mov     bx, si
        add     bx, 2                   ; increase by size of word
        add     ds:[bx-2], ax           ; add low word
        adc     ds:[bx], dx             ; add high word
        jnc     no_more_carry_16        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_16
        add     bx, 2                   ; move pointer to next word

        ; loop until no more carry or until end of double big number
top_carry_loop_16:
        add     word ptr ds:[bx], 1     ; use add, not inc
        jnc     no_more_carry_16
        add     bx, 2                   ; increase by size of word
        loop    top_carry_loop_16

no_more_carry_16:
        add     n2p, 2                  ; increase by word size
        add     si, 2
        dec     carry_steps             ; use one less step
        dec     j
        jnz     top_inner_loop_16       ; dec leaves CF untouched: test ZF only

        add     n1p, 2                  ; increase by word size

        ; While skips is non-zero, widen the inner loop by one word of n2
        ; per pass; afterwards advance the position in r instead.
        cmp     skips, 0
        je      type2_shifts_16
        sub     word ptr n2, 2          ; shift n2 back a word
        inc     steps                   ; one more step this time
        ; leave di and doublesteps where they are
        dec     skips                   ; keep track of how many times we've done this
        jmp     shifts_bottom_16
type2_shifts_16:
        add     di, 2                   ; shift di forward a word
        dec     doublesteps             ; reduce the carry steps needed
shifts_bottom_16:
        mov     si, di                  ; start with si=di
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        jnz     top_outer_loop_16       ; dec leaves CF untouched: test ZF only

        ; result is in r
ENDIF

IFDEF BIG16AND32
        jmp     bottom
ENDIF

IFDEF BIG32
use_32_bit:
.386

        ; clear r
        sub     eax, eax                ; clear eax
        mov     cx, rlength             ; size of r in bytes
        shr     cx, 2                   ; byte = 1/4 dword
        mov     di, word ptr r          ; load r in es:di for stos
        rep     stosd                   ; initialize r to 0

        ; set variables
        mov     ax, rlength             ; set steps for first loop
        sub     ax, bnlength
        shr     ax, 2                   ; byte = 1/4 dword
        mov     steps, ax               ; save in steps

        mov     ax, bnlength
        shr     ax, 2                   ; byte = 1/4 dword
        mov     i, ax                   ; outer loop counter = dwords in n1

        sub     ax, steps
        mov     skips, ax               ; how long to skip over pointer shifts

        mov     ax, rlength             ; number of carry steps available in r
        shr     ax, 2                   ; byte = 1/4 dword
        sub     ax, 2                   ; only rlength/4-2 steps are really needed
        mov     doublesteps, ax
        mov     carry_steps, ax

        ; prepare segments and offsets for loops
        mov     di, word ptr r
        mov     si, di                  ; both si and di are used here
        mov     ds, bignum_seg          ; load ds
        mov     ax, word ptr n1         ; load pointers
        mov     n1p, ax


top_outer_loop_32:
        mov     ax, word ptr n2         ; set n2p pointer
        mov     n2p, ax
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_32:
        mov     bx, n1p
        mov     eax, ds:[bx]
        mov     bx, n2p
        mul     dword ptr ds:[bx]       ; edx:eax = *n1p * *n2p

        mov     bx, si
        add     bx, 4                   ; increase by size of dword
        add     ds:[bx-4], eax          ; add low dword
        adc     ds:[bx], edx            ; add high dword
        jnc     no_more_carry_32        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_32
        add     bx, 4                   ; move pointer to next dword

        ; loop until no more carry or until end of r
top_carry_loop_32:
        add     dword ptr ds:[bx], 1    ; use add, not inc
        jnc     no_more_carry_32
        add     bx, 4                   ; increase by size of dword
        loop    top_carry_loop_32

no_more_carry_32:
        add     n2p, 4                  ; increase by dword size
        add     si, 4
        dec     carry_steps             ; use one less step
        dec     j
        jnz     top_inner_loop_32       ; dec leaves CF untouched: test ZF only

        add     n1p, 4                  ; increase by dword size

        ; While skips is non-zero, widen the inner loop by one dword of n2
        ; per pass; afterwards advance the position in r instead.
        cmp     skips, 0
        je      type2_shifts_32
        sub     word ptr n2, 4          ; shift n2 back a dword
        inc     steps                   ; one more step this time
        ; leave di and doublesteps where they are
        dec     skips                   ; keep track of how many times we've done this
        jmp     shifts_bottom_32
type2_shifts_32:
        add     di, 4                   ; shift di forward a dword
        dec     doublesteps             ; reduce the carry steps needed
shifts_bottom_32:
        mov     si, di                  ; start with si=di
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        jnz     top_outer_loop_32       ; dec leaves CF untouched: test ZF only

        ; result is in r
ENDIF

bottom:
.8086
        pop     ds                      ; restore ds
        cmp     samevar, 1              ; were n1 and n2 the same variable?
        je      pos_answer              ; if yes, then jump

        mov     al, sign1               ; is result + or - ?
        cmp     al, sign2               ; sign(n1) == sign(n2) ?
        je      pos_answer              ; yes
        ; bnlength is temporarily replaced by rlength around the call,
        ; presumably so that neg_a_bn covers all rlength bytes of r
        push    bnlength                ; save bnlength
        mov     ax, rlength
        mov     bnlength, ax            ; set bnlength = rlength
        invoke  neg_a_bn, r             ; does not affect ES
        pop     bnlength                ; restore bnlength
pos_answer:

        mov     ax, word ptr r          ; return r in ax
        ret
unsafe_mult_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n^2
;   because of the symmetry involved, n^2 is much faster than n*n
;   for a bignumber of length l
;      n*n takes l^2 multiplications
;      n^2 takes (l^2+l)/2 multiplications
;          which is about 1/2 n*n as l gets large
;  uses the fact that (a+b+c+...)^2 = (a^2+b^2+c^2+...)+2(ab+ac+bc+...)
;
; Note: r will be a double wide result, 2*bnlength
; SIDE-EFFECTS: n is changed to its absolute value
;
; Returns: ax = offset of r
;
; Outline:
;   1. force n to be positive
;   2. clear r and accumulate the cross terms (ab, ac, bc, ...) into it
;   3. double r with double_a_bn (bnlength is doubled around the call)
;   4. add the squared terms (a^2, b^2, ...) into r
;
unsafe_full_square_bn   PROC USES di si, r:bn_t, n:bn_t
LOCAL i:word, j:word, steps:word, doublesteps:word, carry_steps:word, \
      save_ds:word, \
      rp1: ptr byte, rp2: ptr byte

        mov     save_ds, ds                 ; save ds
        mov     es, bignum_seg              ; load es for when ds is a pain

; By forcing the bignumber to be positive and keeping track of the sign
; bits separately, quite a few multiplies are saved.

                                            ; check for sign bit
        mov     bx, word ptr n
        add     bx, bnlength                ; bx = one past the last byte of n
        mov     al, es:[bx-1]               ; most significant byte of n
        and     al, 80h                     ; check the sign bit
        jz      already_pos
        invoke  neg_a_bn, n
already_pos:

; in the following loops, the following pointers are used
;   n1p(di), n2p(si) = points to the parts of n being used
;   rp1 = points to part of doublebignumber used in outer loop
;   rp2 = points to part of doublebignumber used in inner loop
;   bx  = points to part of doublebignumber for carry flag loop
; All of them are dereferenced through ds, which is loaded with
; bignum_seg before the loops start.

        mov     cx, bnlength            ; size of doublebignumber in words

IFDEF BIG16AND32
        cmp     cpu, 386                ; check cpu
;        jae     use_32_bit              ; use faster 32 bit code if possible
        jb      dont_use_32bit
        jmp     use_32_bit              ; use faster 32 bit code if possible
dont_use_32bit:
ENDIF

IFDEF BIG16
        ; clear r
        sub     ax, ax                  ; clear ax
        ; 2{twice the size}*bnlength/2{bytes per word}
        mov     di, word ptr r          ; load r pointer in es:di for stos
        rep     stosw                   ; initialize r to 0

        ; initialize vars
        mov     dx, bnlength            ; set outer loop counter
        shr     dx, 1                   ; byte = 1/2 word
        dec     dx                      ; don't need to do last one
        mov     i, dx                   ; loop counter
        mov     steps, dx               ; save in steps
        shl     dx, 1                   ; double steps
        sub     dx, 1                   ; only 2*s-1 steps are really needed
        mov     doublesteps, dx
        mov     carry_steps, dx

        ; initialize pointers
        mov     di, word ptr n
        mov     ax, word ptr r
        mov     ds, bignum_seg          ; load ds
        add     ax, 2                   ; start with second word
        mov     rp1, ax
        mov     rp2, ax                 ; start with rp2=rp1

        cmp     i, 0                    ; if bignumberlength is 2
        je      skip_middle_terms_16    ; there are no cross terms

top_outer_loop_16:
        mov     si, di                  ; set n2p pointer
        add     si, 2                   ; to 1 word beyond n1p(di)
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_16:
        mov     ax, ds:[di]
        mul     word ptr ds:[si]        ; dx:ax = *n1p * *n2p

        mov     bx, rp2
        add     bx, 2                   ; increase by size of word
        add     ds:[bx-2], ax           ; add low word
        adc     ds:[bx], dx             ; add high word
        jnc     no_more_carry_16        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_16
        add     bx, 2                   ; move pointer to next word

        ; loop until no more carry or until end of double big number
top_carry_loop_16:
        add     word ptr ds:[bx], 1     ; use add, not inc
        jnc     no_more_carry_16
        add     bx, 2                   ; increase by size of word
        loop    top_carry_loop_16

no_more_carry_16:
        add     si, 2                   ; increase by word size
        add     rp2, 2
        dec     carry_steps             ; use one less step
        dec     j
        jnz     top_inner_loop_16       ; dec leaves CF untouched: test ZF only

        add     di, 2                   ; increase by word size
        add     rp1, 4                  ; increase by 2*word size
        mov     ax, rp1
        mov     rp2, ax                 ; start with rp2=rp1

        sub     doublesteps,2           ; reduce the carry steps needed
                                        ; (borrows on the final pass, so CF
                                        ; must not take part in the loop test)
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     steps                   ; use one less step
        dec     i
        jnz     top_outer_loop_16       ; dec leaves CF untouched: test ZF only

        ; All the middle terms have been multiplied.  Now double it.
        mov     ds, save_ds             ; restore ds to get bnlength
        shl     bnlength, 1             ; r is a double wide bignumber
        invoke  double_a_bn, r          ; doesn't change es
        shr     bnlength, 1             ; restore bnlength

skip_middle_terms_16:                   ; ds is not necessarily restored here

; Now go back and add in the squared terms.
; In the following loops, the following pointers are used
;   n1p(di) = points to the parts of n being used
;   rp1(si) = points to part of doublebignumber used in outer loop
;   bx      = points to part of doublebignumber for carry flag loop
; All of them are dereferenced through ds (bignum_seg).

        mov     di, word ptr n          ; load n1p pointer in di

        mov     ds, save_ds             ; restore ds to get bnlength
        mov     dx, bnlength            ; set outer loop counter
        shr     dx, 1                   ; 1 bytes = 1/2 word
        mov     i, dx                   ; loop counter
        shl     dx, 1                   ; double steps

        sub     dx, 2                   ; only 2*s-2 steps are really needed
        mov     doublesteps, dx
        mov     carry_steps, dx
        mov     si, word ptr r          ; set rp1
        mov     ds, bignum_seg          ; load ds


top_outer_loop_squares_16:

        mov     ax, ds:[di]
        mul     ax                      ; square it

        mov     bx, si
        add     bx, 2                   ; increase by size of word
        add     ds:[bx-2], ax           ; add low word
        adc     ds:[bx], dx             ; add high word
        jnc     no_more_carry_squares_16 ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_squares_16
        add     bx, 2                   ; move pointer to next word

        ; loop until no more carry or until end of double big number
top_carry_loop_squares_16:
        add     word ptr ds:[bx], 1     ; use add, not inc
        jnc     no_more_carry_squares_16
        add     bx, 2                   ; increase by size of word
        loop    top_carry_loop_squares_16

no_more_carry_squares_16:
        add     di, 2                   ; increase by word size
        add     si, 4                   ; increase by 2*word size

        sub     doublesteps,2           ; reduce the carry steps needed
                                        ; (borrows on the final pass, so CF
                                        ; must not take part in the loop test)
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        jnz     top_outer_loop_squares_16 ; dec leaves CF untouched: test ZF only


        ; result is in r, a double wide bignumber
ENDIF

IFDEF BIG16AND32
        jmp     bottom
ENDIF

IFDEF BIG32
use_32_bit:
.386
        ; clear r
        sub     eax, eax                ; clear eax
        ; 2{twice the size}*bnlength/4{bytes per dword}
        shr     cx, 1                   ; size of doublebignumber in dwords
        mov     di, word ptr r          ; load r pointer in es:di for stos
        rep     stosd                   ; initialize r to 0

        ; initialize vars
        mov     dx, bnlength            ; set outer loop counter
        shr     dx, 2                   ; byte = 1/4 dword
        dec     dx                      ; don't need to do last one
        mov     i, dx                   ; loop counter
        mov     steps, dx               ; save in steps
        shl     dx, 1                   ; double steps
        sub     dx, 1                   ; only 2*s-1 steps are really needed
        mov     doublesteps, dx
        mov     carry_steps, dx

        ; initialize pointers
        mov     di, word ptr n          ; load n1p pointer
        mov     ax, word ptr r
        mov     ds, bignum_seg          ; load ds

        add     ax, 4                   ; start with second dword
        mov     rp1, ax
        mov     rp2, ax                 ; start with rp2=rp1

        cmp     i, 0                    ; if bignumberlength is 4
        je      skip_middle_terms_32    ; there are no cross terms

top_outer_loop_32:
        mov     si, di                  ; set n2p pointer
        add     si, 4                   ; to 1 dword beyond n1p(di)
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_32:
        mov     eax, ds:[di]
        mul     dword ptr ds:[si]       ; edx:eax = *n1p * *n2p

        mov     bx, rp2
        add     bx, 4                   ; increase by size of dword
        add     ds:[bx-4], eax          ; add low dword
        adc     ds:[bx], edx            ; add high dword
        jnc     no_more_carry_32        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_32
        add     bx, 4                   ; move pointer to next dword

        ; loop until no more carry or until end of double big number
top_carry_loop_32:
        add     dword ptr ds:[bx], 1    ; use add, not inc
        jnc     no_more_carry_32
        add     bx, 4                   ; increase by size of dword
        loop    top_carry_loop_32

no_more_carry_32:
        add     si, 4                   ; increase by dword size
        add     rp2, 4
        dec     carry_steps             ; use one less step
        dec     j
        jnz     top_inner_loop_32       ; dec leaves CF untouched: test ZF only

        add     di, 4                   ; increase by dword size
        add     rp1, 8                  ; increase by 2*dword size
        mov     ax, rp1
        mov     rp2, ax                 ; start with rp2=rp1

        sub     doublesteps,2           ; reduce the carry steps needed
                                        ; (borrows on the final pass, so CF
                                        ; must not take part in the loop test)
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     steps                   ; use one less step
        dec     i
        jnz     top_outer_loop_32       ; dec leaves CF untouched: test ZF only

        ; All the middle terms have been multiplied.  Now double it.
        mov     ds, save_ds             ; restore ds to get bnlength
        shl     bnlength, 1             ; r is a double wide bignumber
        invoke  double_a_bn, r
        shr     bnlength, 1             ; restore bnlength

skip_middle_terms_32:                   ; ds is not necessarily restored here

; Now go back and add in the squared terms.
; In the following loops, the following pointers are used
;   n1p(di) = points to the parts of n being used
;   rp1(si) = points to part of doublebignumber used in outer loop
;   bx = points to part of doublebignumber for carry flag loop
; All of them are dereferenced through ds (bignum_seg).

        mov     di, word ptr n          ; load n1p pointer in ds:di

        mov     ds, save_ds             ; restore ds to get bnlength
        mov     dx, bnlength            ; set outer loop counter
        shr     dx, 2                   ; 1 bytes = 1/4 dword
        mov     i, dx                   ; loop counter
        shl     dx, 1                   ; double steps

        sub     dx, 2                   ; only 2*s-2 steps are really needed
        mov     doublesteps, dx
        mov     carry_steps, dx
        mov     si, word ptr r          ; set rp1
        mov     ds, bignum_seg          ; load ds

top_outer_loop_squares_32:

        mov     eax, ds:[di]
        mul     eax                     ; square it

        mov     bx, si
        add     bx, 4                   ; increase by size of dword
        add     ds:[bx-4], eax          ; add low dword
        adc     ds:[bx], edx            ; add high dword
        jnc     no_more_carry_squares_32 ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_squares_32
        add     bx, 4                   ; move pointer to next dword

        ; loop until no more carry or until end of double big number
top_carry_loop_squares_32:
        add     dword ptr ds:[bx], 1    ; use add, not inc
        jnc     no_more_carry_squares_32
        add     bx, 4                   ; increase by size of dword
        loop    top_carry_loop_squares_32

no_more_carry_squares_32:
        add     di, 4                   ; increase by dword size
        add     si, 8                   ; increase by 2*dword size

        sub     doublesteps,2           ; reduce the carry steps needed
                                        ; (borrows on the final pass, so CF
                                        ; must not take part in the loop test)
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        jnz     top_outer_loop_squares_32 ; dec leaves CF untouched: test ZF only


        ; result is in r, a double wide bignumber
ENDIF

bottom:
.8086

; since it is a square, the result has to already be positive

        mov     ds, save_ds             ; restore ds
        mov     ax, word ptr r          ; return r in ax
        ret
unsafe_full_square_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r = n^2
;   because of the symmetry involved, n^2 is much faster than n*n
;   for a bignumber of length l
;      n*n takes l^2 multiplications
;      n^2 takes (l^2+l)/2 multiplications
;          which is about 1/2 n*n as l gets large
;  uses the fact that (a+b+c+...)^2 = (a^2+b^2+c^2+...)+2(ab+ac+bc+...)
;
; Note: r will be of length rlength
;       2*bnlength >= rlength > bnlength
; SIDE-EFFECTS: n is changed to its absolute value
;
unsafe_square_bn   PROC USES di si, r:bn_t, n:bn_t
LOCAL i:word, j:word, steps:word, doublesteps:word, carry_steps:word, \
      skips:word, rodd:word, \
      save_ds:word, \
      n3p: ptr byte, \
      rp1: ptr byte, rp2: ptr byte

; This whole procedure would be a great deal simpler if we could assume that
; rlength < 2*bnlength (that is, not =).  Therefore, we will take the
; easy way out and call full_square_bn() if it is.
        mov     ax, rlength
        shr     ax, 1                   ; 1/2 * rlength
        cmp     ax, bnlength            ; 1/2 * rlength == bnlength?
        jne     not_full_square
        invoke  unsafe_full_square_bn, r, n
        ; dx:ax is still loaded with return value
        jmp     quit_proc               ; we're outa here
not_full_square:

        mov     save_ds, ds
        mov     es, bignum_seg              ; load es for when ds is a pain

; By forcing the bignumber to be positive and keeping track of the sign
; bits separately, quite a few multiplies are saved.

                                            ; check for sign bit
        mov     bx, word ptr n              ; load n1 pointer in es:bx
        add     bx, bnlength
        mov     al, es:[bx-1]
        and     al, 80h                     ; check the sign bit
        jz      already_pos
        invoke  neg_a_bn, n
already_pos:

; in the following loops, the following pointers are used
;   n1p(di), n2p(si) = points to the parts of n being used (es)
;   rp1 = points to part of doublebignumber used in outer loop  (ds)
;   rp2 = points to part of doublebignumber used in inner loop  (ds)
;   bx  = points to part of doublebignumber for carry flag loop (ds)

IFDEF BIG16AND32
        cmp     cpu, 386                ; check cpu
;        jae     use_32_bit              ; use faster 32 bit code if possible
        jb      skip_use_32bit
        jmp     use_32_bit              ; use faster 32 bit code if possible
skip_use_32bit:
ENDIF

IFDEF BIG16
        ; clear r
        sub     ax, ax                  ; clear ax
        mov     cx, rlength             ; size of rlength in bytes
        shr     cx, 1                   ; byte = 1/2 word
        mov     di, word ptr r          ; load r pointer in es:di for stos
        rep     stosw                   ; initialize r to 0


        ; initialize vars

        ; determine whether r is on an odd or even word in the number
        ; (even if rlength==2*bnlength, dec r alternates odd/even)
        mov     ax, bnlength
        shl     ax, 1                   ; double wide width
        sub     ax, rlength             ; 2*bnlength-rlength
        shr     ax, 1                   ; 1 byte = 1/2 word
        and     ax, 0001h               ; check the odd sign bit
        mov     rodd, ax

        mov     ax, bnlength            ; set outer loop counter
        shr     ax, 1                   ; byte = 1/2 word
        dec     ax                      ; don't need to do last one
        mov     i, ax                   ; loop counter

        mov     ax, rlength             ; set steps for first loop
        sub     ax, bnlength
        shr     ax, 1                   ; byte = 1/2 word
        mov     steps, ax               ; save in steps

        mov     dx, bnlength
        shr     dx, 1                   ; bnlength/2
        add     ax, dx                  ; steps+bnlength/2
        sub     ax, 2                   ; steps+bnlength/2-2
        mov     doublesteps, ax
        mov     carry_steps, ax

        mov     ax, i
        sub     ax, steps
        shr     ax, 1                   ; for both words and dwords
        mov     skips, ax               ; how long to skip over pointer shifts

        ; initialize pointers
        mov     di, word ptr n
        mov     si, di
        mov     ax, bnlength
        shr     ax, 1                   ; 1 byte = 1/2 word
        sub     ax, steps
        shl     ax, 1                   ; 1 byte = 1/2 word
        add     si, ax                  ; n2p = n1p + 2*(bnlength/2 - steps)
        mov     n3p, si                 ; save for later use
        mov     ax, word ptr r
        mov     ds, bignum_seg          ; load ds
        mov     rp1, ax
        mov     rp2, ax                 ; start with rp2=rp1

        cmp     i, 0                    ; if bignumberlength is 2
;        je      skip_middle_terms_16
        jne     top_outer_loop_16
        jmp     skip_middle_terms_16

top_outer_loop_16:
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_16:
        mov     ax, ds:[di]
        mul     word ptr ds:[si]

        mov     bx, rp2
        add     bx, 2                   ; increase by size of word
        add     ds:[bx-2], ax           ; add low word
        adc     ds:[bx], dx             ; add high word
        jnc     no_more_carry_16        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_16
        add     bx, 2                   ; move pointer to next word

        ; loop until no more carry or until end of double big number
top_carry_loop_16:
        add     word ptr ds:[bx], 1     ; use add, not inc
        jnc     no_more_carry_16
        add     bx, 2                   ; increase by size of word
        loop    top_carry_loop_16

no_more_carry_16:
        add     si, 2                   ; increase by word size
        add     rp2, 2
        dec     carry_steps             ; use one less step
        dec     j
        ja      top_inner_loop_16

        add     di, 2                   ; increase by word size

        mov     ax, rodd                ; whether r is on an odd or even word

        cmp     skips, 0
        jle     type2_shifts_16
        sub     n3p, 2                  ; point to previous word
        mov     si, n3p
        inc     steps                   ; one more step this time
        ; leave rp1 and doublesteps where they are
        dec     skips
        jmp     shifts_bottom_16
type2_shifts_16:    ; only gets executed once
        jl      type3_shifts_16
        sub     steps, ax               ; steps -= (0 or 1)
        inc     ax                      ; ax = 1 or 2 now
        sub     doublesteps, ax         ; decrease double steps by 1 or 2
        shl     ax, 1                   ; 1 byte = 1/2 word
        add     rp1, ax                 ; add 1 or 2 words
        mov     si, di
        add     si, 2                   ; si = di + word
        dec     skips                   ; make skips negative
        jmp     shifts_bottom_16
type3_shifts_16:
        dec     steps
        sub     doublesteps, 2
        add     rp1, 4                  ; + two words
        mov     si, di
        add     si, 2                   ; si = di + word
shifts_bottom_16:

        mov     ax, rp1
        mov     rp2, ax                 ; start with rp2=rp1

        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
;        ja      top_outer_loop_16
        jna     not_top_outer_loop_16
        jmp     top_outer_loop_16
not_top_outer_loop_16:
        ; All the middle terms have been multiplied.  Now double it.
        mov     ds, save_ds             ; restore ds to get bnlength
        push    bnlength                ; save bnlength
        mov     ax, rlength
        mov     bnlength, ax            ; r is of length rlength
        invoke  double_a_bn, r
        pop     bnlength

skip_middle_terms_16:
; Now go back and add in the squared terms.
; In the following loops, the following pointers are used
;   n1p(di) = points to the parts of n being used (es)
;   rp1(si) = points to part of doublebignumber used in outer loop (ds)
;   bx = points to part of doublebignumber for carry flag loop     (ds)

        ; be careful, the next dozen or so lines are confusing!

        ; determine whether r is on an odd or even word in the number
        mov     ax, bnlength
        shl     ax, 1                   ; double wide width
        sub     ax, rlength             ; 2*bnlength-rlength
        mov     dx, ax                  ; save this for a moment
        and     ax, 0002h               ; check the odd sign bit

        mov     si, word ptr r          ; load r pointer in ds:si
        add     si, ax                  ; depending on odd or even byte

        shr     dx, 1                   ; assumes word size
        inc     dx
        and     dx, 0FFFEh              ; ~2+1, turn off last bit, mult of 2
        mov     di, word ptr n          ; load n1p pointer in di
                                        ; es is still set from before
        add     di, dx

        mov     ax, bnlength
        sub     ax, dx
        shr     ax, 1                   ; 1 byte = 1/2 word
        mov     i, ax

        shl     ax, 1                   ; double steps
        sub     ax, 2                   ; only 2*s-2 steps are really needed
        mov     doublesteps, ax
        mov     carry_steps, ax

        mov     ds, bignum_seg          ; load ds

top_outer_loop_squares_16:

        mov     ax, ds:[di]
        mul     ax                      ; square it

        mov     bx, si
        add     bx, 2                   ; increase by size of word
        add     ds:[bx-2], ax           ; add low word
        adc     ds:[bx], dx             ; add high word
        jnc     no_more_carry_squares_16 ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_squares_16
        add     bx, 2                   ; move pointer to next word

        ; loop until no more carry or until end of double big number
top_carry_loop_squares_16:
        add     word ptr ds:[bx], 1     ; use add, not inc
        jnc     no_more_carry_squares_16
        add     bx, 2                   ; increase by size of word
        loop    top_carry_loop_squares_16

no_more_carry_squares_16:
        add     di, 2                   ; increase by word size
        add     si, 4                   ; increase by 2*word size

        sub     doublesteps,2           ; reduce the carry steps needed
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        ja      top_outer_loop_squares_16


        ; result is in r
ENDIF

IFDEF BIG16AND32
        jmp     bottom
ENDIF

IFDEF BIG32
use_32_bit:
.386
        ; clear r
        sub     eax, eax                ; clear eax
        mov     cx, rlength             ; size of rlength in bytes
        shr     cx, 2                   ; byte = 1/4 dword
        mov     di, word ptr r          ; load r pointer in es:di for stos
        rep     stosd                   ; initialize r to 0

        ; initialize vars

        ; determine whether r is on an odd or even dword in the number
        ; (even if rlength==2*bnlength, dec r alternates odd/even)
        mov     ax, bnlength
        shl     ax, 1                   ; double wide width
        sub     ax, rlength             ; 2*bnlength-rlength
        shr     ax, 2                   ; 1 byte = 1/4 dword
        and     ax, 0001h               ; check the odd sign bit
        mov     rodd, ax

        mov     ax, bnlength            ; set outer loop counter
        shr     ax, 2                   ; byte = 1/4 dword
        dec     ax                      ; don't need to do last one
        mov     i, ax                   ; loop counter

        mov     ax, rlength             ; set steps for first loop
        sub     ax, bnlength
        shr     ax, 2                   ; byte = 1/4 dword
        mov     steps, ax               ; save in steps

        mov     dx, bnlength
        shr     dx, 2                   ; bnlength/4
        add     ax, dx                  ; steps+bnlength/4
        sub     ax, 2                   ; steps+bnlength/4-2
        mov     doublesteps, ax
        mov     carry_steps, ax

        mov     ax, i
        sub     ax, steps
        shr     ax, 1                   ; for both words and dwords
        mov     skips, ax               ; how long to skip over pointer shifts

        ; initialize pointers
        mov     di, word ptr n          ; load n1p pointer
        mov     si, di
        mov     ax, bnlength
        shr     ax, 2                   ; 1 byte = 1/4 dword
        sub     ax, steps
        shl     ax, 2                   ; 1 byte = 1/4 dword
        add     si, ax                  ; n2p = n1p + bnlength/4 - steps
        mov     n3p, si                 ; save for later use
        mov     ax, word ptr r
        mov     ds, bignum_seg          ; load ds
        mov     rp1, ax
        mov     rp2, ax                 ; start with rp2=rp1

        cmp     i, 0                    ; if bnlength is a single dword (4 bytes)
        je      skip_middle_terms_32

top_outer_loop_32:
        mov     ax, steps               ; set inner loop counter
        mov     j, ax

top_inner_loop_32:
        mov     eax, ds:[di]
        mul     dword ptr ds:[si]

        mov     bx, rp2
        add     bx, 4                   ; increase by size of dword
        add     ds:[bx-4], eax          ; add low dword
        adc     ds:[bx], edx            ; add high dword
        jnc     no_more_carry_32        ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_32
        add     bx, 4                   ; move pointer to next dword

        ; loop until no more carry or until end of double big number
top_carry_loop_32:
        add     dword ptr ds:[bx], 1    ; use add, not inc
        jnc     no_more_carry_32
        add     bx, 4                   ; increase by size of dword
        loop    top_carry_loop_32

no_more_carry_32:
        add     si, 4                   ; increase by dword size
        add     rp2, 4
        dec     carry_steps             ; use one less step
        dec     j
        ja      top_inner_loop_32

        add     di, 4                   ; increase by dword size

        mov     ax, rodd                ; whether r is on an odd or even dword

        cmp     skips, 0
        jle     type2_shifts_32
        sub     n3p, 4                  ; point to previous dword
        mov     si, n3p
        inc     steps                   ; one more step this time
        ; leave rp1 and doublesteps where they are
        dec     skips
        jmp     shifts_bottom_32
type2_shifts_32:    ; only gets executed once
        jl      type3_shifts_32
        sub     steps, ax               ; steps -= (0 or 1)
        inc     ax                      ; ax = 1 or 2 now
        sub     doublesteps, ax         ; decrease double steps by 1 or 2
        shl     ax, 2                   ; 1 byte = 1/4 dword
        add     rp1, ax                 ; add 1 or 2 dwords
        mov     si, di
        add     si, 4                   ; si = di + dword
        dec     skips                   ; make skips negative
        jmp     shifts_bottom_32
type3_shifts_32:
        dec     steps
        sub     doublesteps, 2
        add     rp1, 8                  ; + two dwords
        mov     si, di
        add     si, 4                   ; si = di + dword
shifts_bottom_32:

        mov     ax, rp1
        mov     rp2, ax                 ; start with rp2=rp1

        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        ja      top_outer_loop_32

        ; All the middle terms have been multiplied.  Now double it.
        mov     ds, save_ds             ; restore ds to get bnlength
        push    bnlength                ; save bnlength
        mov     ax, rlength
        mov     bnlength, ax            ; r is of length rlength
        invoke  double_a_bn, r
        pop     bnlength

skip_middle_terms_32:
; Now go back and add in the squared terms.
; In the following loops, the following pointers are used
;   n1p(di) = points to the parts of n being used (es)
;   rp1(si) = points to part of doublebignumber used in outer loop (ds)
;   bx = points to part of doublebignumber for carry flag loop     (ds)

        ; be careful, the next dozen or so lines are confusing!

        ; determine whether r is on an odd or even dword in the number
        mov     ax, bnlength
        shl     ax, 1                   ; double wide width
        sub     ax, rlength             ; 2*bnlength-rlength
        mov     dx, ax                  ; save this for a moment
        and     ax, 0004h               ; check the odd sign bit

        mov     si, word ptr r          ; load r pointer in ds:si
        add     si, ax                  ; depending on odd or even byte

        shr     dx, 2                   ; assumes dword size
        inc     dx
        and     dx, 0FFFEh              ; ~2+1, turn off last bit, mult of 2
        shl     dx, 1
        mov     di, word ptr n          ; load n1p pointer in di
                                        ; es is still set from before
        add     di, dx

        mov     ax, bnlength
        sub     ax, dx
        shr     ax, 2                   ; 1 byte = 1/4 dword
        mov     i, ax

        shl     ax, 1                   ; double steps
        sub     ax, 2                   ; only 2*s-2 steps are really needed
        mov     doublesteps, ax
        mov     carry_steps, ax

        mov     ds, bignum_seg          ; load ds

top_outer_loop_squares_32:

        mov     eax, ds:[di]
        mul     eax                     ; square it

        mov     bx, si
        add     bx, 4                   ; increase by size of dword
        add     ds:[bx-4], eax          ; add low dword
        adc     ds:[bx], edx            ; add high dword
        jnc     no_more_carry_squares_32 ; carry loop not necessary

        mov     cx, carry_steps         ; how many till end of double big number
        jcxz    no_more_carry_squares_32
        add     bx, 4                   ; move pointer to next dword

        ; loop until no more carry or until end of double big number
top_carry_loop_squares_32:
        add     dword ptr ds:[bx], 1    ; use add, not inc
        jnc     no_more_carry_squares_32
        add     bx, 4                   ; increase by size of dword
        loop    top_carry_loop_squares_32

no_more_carry_squares_32:
        add     di, 4                   ; increase by dword size
        add     si, 8                   ; increase by 2*dword size

        sub     doublesteps,2           ; reduce the carry steps needed
        mov     ax, doublesteps
        mov     carry_steps, ax

        dec     i
        ja      top_outer_loop_squares_32


        ; result is in r
ENDIF

bottom:
.8086

; since it is a square, the result has to already be positive

        mov     ds, save_ds             ; restore ds
        mov     ax, word ptr r          ; return r in ax

quit_proc:
        ret
unsafe_square_bn   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; mult_bn_int:  r = n * u, where u is an unsigned 16-bit integer
;
;   r, n - offsets (within bignum_seg) of bnlength-byte big numbers
;   u    - unsigned word multiplier
;
; n is walked from its lowest address upward.  Each word (dword on a 386)
; is multiplied by u and the high half of the previous product is added in.
; The high half left over after the last step is not stored anywhere.
; ds is saved on entry and restored before returning.
; Returns the offset of r in ax.
mult_bn_int   PROC USES di si, r:bn_t, n:bn_t, u:word
LOCAL   lu:dword                        ; u widened to 32 bits for the dword loop

        push    ds                      ; ds gets repointed at bignum_seg below
        mov     si, WORD PTR n          ; si = offset of source
        mov     di, WORD PTR r          ; di = offset of destination
        mov     cx, bnlength            ; cx = length in bytes


IFDEF BIG16AND32
        cmp     cpu, 386                ; running on a 386 or better?
        jae     mul_by_dwords           ; yes, use the dword loop
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:si -> n, ds:di -> r

        ; every word of r is overwritten, so r is not cleared first

        xor     bx, bx                  ; bx = high word carried between steps
        shr     cx, 1                   ; byte count -> word count

mul_word_loop:
        mov     ax, ds:[si]             ; fetch next word of n
        mul     u                       ; dx:ax = word * u (u is on the stack)
        add     ax, bx                  ; add high word of previous product
        adc     dx, 0                   ; ripple the carry into the high word
        mov     ds:[di], ax             ; low word goes to r
        mov     bx, dx                  ; high word is carried to the next step

        add     si, 2                   ; advance source by one word
        add     di, 2                   ; advance destination by one word
        loop    mul_word_loop
ENDIF

IFDEF BIG16AND32
        jmp     mul_done
ENDIF

IFDEF BIG32
mul_by_dwords:
.386
        mov     ds, bignum_seg          ; ds:si -> n, ds:di -> r

        ; every dword of r is overwritten, so r is not cleared first

        movzx   eax, u                  ; zero-extend u to 32 bits
        mov     lu, eax                 ;   and park it in the local
        xor     ebx, ebx                ; ebx = high dword carried between steps
        shr     cx, 2                   ; byte count -> dword count

mul_dword_loop:
        mov     eax, ds:[si]            ; fetch next dword of n
        mul     lu                      ; edx:eax = dword * u
        add     eax, ebx                ; add high dword of previous product
        adc     edx, 0                  ; ripple the carry into the high dword
        mov     ds:[di], eax            ; low dword goes to r
        mov     ebx, edx                ; high dword is carried to the next step

        add     si, 4                   ; advance source by one dword
        add     di, 4                   ; advance destination by one dword
        loop    mul_dword_loop
ENDIF

mul_done:
.8086

        pop     ds                      ; back to the caller's ds
        mov     ax, word ptr r          ; return value: offset of r
        ret
mult_bn_int   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; mult_a_bn_int:  r *= u, where u is an unsigned 16-bit integer
;
;   r - offset (within bignum_seg) of a bnlength-byte big number,
;       multiplied in place
;   u - unsigned word multiplier
;
; r is walked from its lowest address upward.  Each word (dword on a 386)
; is replaced by the low half of word*u plus the high half carried from the
; previous step.  The high half left after the last step is dropped.
; ds is saved on entry and restored before returning.
; Returns the offset of r in ax.
mult_a_bn_int   PROC USES di si, r:bn_t, u:word

        push    ds                      ; ds gets repointed at bignum_seg below
        mov     si, WORD PTR r          ; si = offset of the number
        mov     cx, bnlength            ; cx = length in bytes


IFDEF BIG16AND32
        cmp     cpu, 386                ; running on a 386 or better?
        jae     mul_by_dwords           ; yes, use the dword loop
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:si -> r
        ; each word is read before it is rewritten, so r is not cleared
        mov     di, u                   ; keep the multiplier in di
        xor     bx, bx                  ; bx = high word carried between steps
        shr     cx, 1                   ; byte count -> word count

mul_word_loop:
        mov     ax, ds:[si]             ; fetch next word of r
        mul     di                      ; dx:ax = word * u
        add     ax, bx                  ; add high word of previous product
        adc     dx, 0                   ; ripple the carry into the high word
        mov     ds:[si], ax             ; write the low word back in place
        mov     bx, dx                  ; high word is carried to the next step

        add     si, 2                   ; advance by one word
        loop    mul_word_loop
ENDIF

IFDEF BIG16AND32
        jmp     mul_done
ENDIF

IFDEF BIG32
mul_by_dwords:
.386
        mov     ds, bignum_seg          ; ds:si -> r
        ; each dword is read before it is rewritten, so r is not cleared
        movzx   edi, u                  ; edi = u zero-extended to 32 bits
        xor     ebx, ebx                ; ebx = high dword carried between steps
        shr     cx, 2                   ; byte count -> dword count

mul_dword_loop:
        mov     eax, ds:[si]            ; fetch next dword of r
        mul     edi                     ; edx:eax = dword * u
        add     eax, ebx                ; add high dword of previous product
        adc     edx, 0                  ; ripple the carry into the high dword
        mov     ds:[si], eax            ; write the low dword back in place
        mov     ebx, edx                ; high dword is carried to the next step

        add     si, 4                   ; advance by one dword
        loop    mul_dword_loop
ENDIF

mul_done:
.8086

        pop     ds                      ; back to the caller's ds
        mov     ax, word ptr r          ; return value: offset of r
        ret
mult_a_bn_int   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsafe_div_bn_int:  r = n / u, where u is an unsigned 16-bit integer
;
;   r, n - offsets (within bignum_seg) of bnlength-byte big numbers
;   u    - unsigned word divisor
;
; The sign bit (bit 7 of the highest byte) of n is recorded in the local
; "sign".  If it is set, neg_a_bn is invoked on n before dividing and on r
; afterwards; n itself is not put back the way it was.
; The division runs from the most significant word (dword on a 386) down,
; with the remainder of each step left in dx (edx) as the high half of the
; next dividend.
; ds is saved on entry and restored before returning.
; Returns the offset of r in ax.
unsafe_div_bn_int   PROC USES di si, r:bn_t, n:bn_t, u:word
LOCAL  sign:byte

        push    ds
                                            ; look at the sign of n
        mov     es, bignum_seg              ; es:bx will address n
        mov     bx, WORD PTR n
        add     bx, bnlength                ; bx = one past the last byte of n
        mov     al, es:[bx-1]               ; highest byte of n
        and     al, 80h                     ; keep only the sign bit
        mov     sign, al                    ; remember it (mov leaves flags alone)
        jz      dividend_ready              ; sign bit clear, use n as is
        invoke  neg_a_bn, n
dividend_ready:

        mov     cx, bnlength                ; cx = length in bytes
        mov     si, word ptr n
        mov     di, word ptr r
        ; move both pointers just past the most significant end
        add     si, cx
        add     di, cx

IFDEF BIG16AND32
        cmp     cpu, 386                ; running on a 386 or better?
        jae     div_by_dwords           ; yes, use the dword loop
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:si -> n, ds:di -> r

        ; r needs no clearing: every word of it is stored, never added to
        mov     bx, u                   ; bx = divisor
        shr     cx, 1                   ; byte count -> word count

        ; step back onto the most significant word of each number
        sub     si, 2
        sub     di, 2

        xor     dx, dx                  ; no remainder going into the first step

div_word_loop:
        mov     ax, ds:[si]             ; dx:ax = remainder:next word of n
        div     bx
        mov     ds:[di], ax             ; quotient word goes to r
                                        ; remainder stays in dx for the next step

        sub     di, 2                   ; down one word in r
        sub     si, 2                   ; down one word in n
        loop    div_word_loop
ENDIF

IFDEF BIG16AND32
        jmp     div_done
ENDIF

IFDEF BIG32
div_by_dwords:
.386
        mov     ds, bignum_seg          ; ds:si -> n, ds:di -> r

        ; r needs no clearing: every dword of it is stored, never added to
        movzx   ebx, u                  ; ebx = divisor, upper word zero
        shr     cx, 2                   ; byte count -> dword count

        ; step back onto the most significant dword of each number
        sub     si, 4
        sub     di, 4

        xor     edx, edx                ; no remainder going into the first step

div_dword_loop:
        mov     eax, ds:[si]            ; edx:eax = remainder:next dword of n
        div     ebx
        mov     ds:[di], eax            ; quotient dword goes to r
                                        ; remainder stays in edx for the next step

        sub     di, 4                   ; down one dword in r
        sub     si, 4                   ; down one dword in n
        loop    div_dword_loop
ENDIF

div_done:
.8086

        pop     ds                      ; back to the caller's ds

        cmp     sign, 0                 ; was the sign bit of n set?
        je      result_ready            ; no, r already has the right sign
        invoke  neg_a_bn, r
result_ready:

        mov     ax, word ptr r          ; return value: offset of r
        ret
unsafe_div_bn_int   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; div_a_bn_int:  r /= u, where u is an unsigned 16-bit integer
;
;   r - offset (within bignum_seg) of a bnlength-byte big number,
;       divided in place
;   u - unsigned word divisor
;
; The sign bit (bit 7 of the highest byte) of r is recorded in the local
; "sign".  If it is set, neg_a_bn is invoked on r before the division and
; again after it.
; The division runs from the most significant word (dword on a 386) down,
; with the remainder of each step left in dx (edx) as the high half of the
; next dividend.
; ds is saved on entry and restored before returning.
; Returns the offset of r in ax.
div_a_bn_int   PROC USES si, r:bn_t, u:word
LOCAL  sign:byte

        push    ds

        mov     es, bignum_seg              ; es:bx will address r
        mov     bx, WORD PTR r
        add     bx, bnlength                ; bx = one past the last byte of r
        mov     al, es:[bx-1]               ; highest byte of r
        and     al, 80h                     ; keep only the sign bit
        mov     sign, al                    ; remember it (mov leaves flags alone)
        jz      dividend_ready              ; sign bit clear, use r as is
        invoke  neg_a_bn, r
dividend_ready:

        mov     si, WORD PTR r
        mov     cx, bnlength            ; cx = length in bytes
        ; move the pointer just past the most significant end
        add     si, cx


IFDEF BIG16AND32
        cmp     cpu, 386                ; running on a 386 or better?
        jae     div_by_dwords           ; yes, use the dword loop
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; ds:si -> r

        ; each word is read, divided and stored back; nothing to clear
        mov     bx, u                   ; bx = divisor
        shr     cx, 1                   ; byte count -> word count

        ; step back onto the most significant word
        sub     si, 2

        xor     dx, dx                  ; no remainder going into the first step

div_word_loop:
        mov     ax, ds:[si]             ; dx:ax = remainder:next word of r
        div     bx
        mov     ds:[si], ax             ; quotient word replaces it
                                        ; remainder stays in dx for the next step

        sub     si, 2                   ; down one word in r
        loop    div_word_loop
ENDIF

IFDEF BIG16AND32
        jmp     div_done
ENDIF

IFDEF BIG32
div_by_dwords:
.386
        mov     ds, bignum_seg          ; ds:si -> r

        ; each dword is read, divided and stored back; nothing to clear
        movzx   ebx, u                  ; ebx = divisor, upper word zero
        shr     cx, 2                   ; byte count -> dword count

        ; step back onto the most significant dword
        sub     si, 4

        xor     edx, edx                ; no remainder going into the first step

div_dword_loop:
        mov     eax, ds:[si]            ; edx:eax = remainder:next dword of r
        div     ebx
        mov     ds:[si], eax            ; quotient dword replaces it
                                        ; remainder stays in edx for the next step

        sub     si, 4                   ; down one dword in r
        loop    div_dword_loop
ENDIF

div_done:
.8086
        pop     ds                      ; back to the caller's ds

        cmp     sign, 0                 ; was the sign bit of r set on entry?
        je      result_ready            ; no, r already has the right sign
        invoke  neg_a_bn, r
result_ready:

        mov     ax, word ptr r          ; return value: offset of r
        ret
div_a_bn_int   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; bf_t routines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; clear_bf:  r = 0
;
;   r - offset (within bignum_seg) of a bf_t
;
; Zero-fills bflength bytes plus the 2-byte exponent word that follows
; them, using stos through es:di.
; Returns the offset of r in ax.
clear_bf   PROC USES di, r:bf_t

        mov     es, bignum_seg          ; es:di -> r, as stos requires
        mov     di, word ptr r
        mov     cx, bflength            ; cx = length in bytes, exponent excluded

IFDEF BIG16AND32
        cmp     cpu, 386                ; running on a 386 or better?
        jae     short zero_by_dwords    ; yes, fill a dword at a time
ENDIF

IFDEF BIG16
        shr     cx, 1                   ; byte count -> word count
        inc     cx                      ; one more word for the exponent
        xor     ax, ax                  ; fill value
        rep     stosw                   ; zero r a word at a time
ENDIF

IFDEF BIG16AND32
        jmp     zero_done
ENDIF

IFDEF BIG32
zero_by_dwords:
.386
        shr     cx, 2                   ; byte count -> dword count
        xor     eax, eax                ; fill value
        rep     stosd                   ; zero r a dword at a time
        stosw                           ; then the exponent word
ENDIF

zero_done:
.8086
        mov     ax, word ptr r          ; return value: offset of r
        ret

clear_bf   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; copy_bf:  r = n
;
;   r, n - offsets (within bignum_seg) of bf_t numbers
;
; Copies bflength bytes plus the 2-byte exponent word that follows them
; (bflength+2 bytes in total) from n to r with movs.
; ds is kept in ax while it is repointed and restored before returning.
; Returns the offset of r in ax.
copy_bf   PROC USES di si, r:bf_t, n:bf_t

        mov     ax, ds                  ; save ds for later (movs leaves ax alone)
        mov     cx, bflength
        add     cx, 2                   ; total bytes = bflength + exponent word
        mov     di, word ptr r
        mov     es, bignum_seg          ; load pointer in es:di
        mov     si, word ptr n

IFDEF BIG16AND32
        cmp     cpu, 386                ; check cpu
        jae     short use_32_bit        ; use faster 32 bit code if possible
ENDIF

IFDEF BIG16
        mov     ds, bignum_seg          ; load pointer in ds:si for movs

        ; cx already includes the exponent word (see the add above), so no
        ; further increment is needed; adding one here would copy a word
        ; past the end of both n and r.
        shr     cx, 1                   ; 1 byte = 1/2 word
        rep     movsw                   ; copy word at a time
ENDIF

IFDEF BIG16AND32
        jmp     bottom
ENDIF

IFDEF BIG32
use_32_bit:
.386
        mov     ds, bignum_seg          ; load pointer in ds:si for movs

        ; (bflength+2) shr 2 drops the 2 extra bytes, leaving bflength/4
        ; dwords; the exponent word is then copied separately.
        ; NOTE(review): this assumes bflength is a multiple of 4 - confirm.
        shr     cx, 2                   ; 1 byte = 1/4 dword
        rep     movsd                   ; copy dword at a time
        movsw                           ; plus the exponent
ENDIF

bottom:
.8086
        mov     ds, ax                  ; restore ds
        mov     ax, word ptr r          ; return r in ax
        ret

copy_bf   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LDBL bftofloat(bf_t n);
; converts a bf number to a 10 byte real
;
bftofloat   PROC USES di si, n:bf_t
   LOCAL value[11]:BYTE   ; 11=10+1
;
; Builds an 80 bit real in value[1..10] and loads it onto the fpu stack.
; Layout of the local buffer as used below:
;   value[0]     extra low order byte, used only for rounding
;   value[1..8]  64 bit mantissa of the real10
;   value[9..10] sign bit + biased exponent of the real10
;
; Register usage throughout the routine:
;   ax = caller's ds (ds is pointed at bignum_seg for the copy and
;        restored at return:, so ax must not be modified in between)
;   bl = byte read from n+bflength-1, later masked down to its sign bit
;   cx = movebytes, number of bytes copied from n into value
;   dx = exponent, read from n+bflength and converted from base 256 to base 2

      mov      ax, ds                  ; save ds

      mov      cx, 9                   ; need up to 9 bytes
      cmp      bflength, 10            ; but no more than bflength-1
      jae      movebytes_set
      mov      cx, bflength            ; bflength is less than 10
      dec      cx                      ; cx=movebytes=bflength-1, 1 byte padding
movebytes_set:

IFDEF BIG16AND32
      cmp     cpu, 386              ; check cpu
;      jae     use_32_bit            ; use faster 32 bit code if possible
      ; the conditional jump is inverted around an unconditional jmp,
      ; replacing the single jae that is commented out above
      jb      over_use_32bit
      jmp     use_32_bit            ; use faster 32 bit code if possible
over_use_32bit:
ENDIF

IFDEF BIG16
; 16 bit code
      ; clear value
      mov      word ptr value[0], 0
      mov      word ptr value[2], 0
      mov      word ptr value[4], 0
      mov      word ptr value[6], 0
      mov      word ptr value[8], 0
      mov      byte ptr value[10], 0

      ; copy bytes from n to value
      ; destination is value[9-movebytes .. 8], so the copied bytes are
      ; right justified against value[8]; anything below stays zero
      lea      di, value+9
      sub      di, cx               ; cx holds movebytes
      mov      dx, ss               ; move ss to es for movs
      mov      es, dx               ; ie: move ss:value+9-cx to es:di
      mov      bx, bflength
      dec      bx
      sub      bx, cx               ; cx holds movebytes
      mov      si, word ptr n
      mov      ds, bignum_seg       ; move n to ds:si for movs
      add      si, bx               ; n+bflength-1-movebytes
      rep movsb
      mov      bl, ds:[si]          ; save sign byte, si now points to it
      inc      si                   ; point to exponent
      mov      dx, ds:[si]          ; use dx as exponent
      mov      cl, 3                ; put exponent (dx) in base 2
      shl      dx, cl               ; 256^n = 2^(8n)

      ; adjust for negative values
      and      bl, 10000000b           ; isolate sign bit
      jz       not_neg_16
      ; two's complement negate across 9 bytes: neg leaves CF=0 only when
      ; the low word was zero, so cmc turns CF into the +1 carry that the
      ; following not/adc pairs propagate upward (not leaves CF untouched)
      neg      word ptr value[0]       ; take the negative of the 9 byte number
      cmc                              ; toggle carry flag
      not      word ptr value[2]
      adc      word ptr value[2], 0
      not      word ptr value[4]
      adc      word ptr value[4], 0
      not      word ptr value[6]
      adc      word ptr value[6], 0
      not      byte ptr value[8]       ; notice this last one is byte ptr
      adc      byte ptr value[8], 0
not_neg_16:

      ; only the top byte is examined here
      ; NOTE(review): presumably a zero top byte can only occur for a zero
      ; bf number (normalized mantissa) - confirm against the bf format
      cmp      byte ptr value[8], 0          ; test for 0
      jnz      top_shift_16
      fldz
      jmp      return

      ; Shift until most signifcant bit is set.
      ; each one bit shift left is compensated by decrementing the exponent
top_shift_16:
      test     byte ptr value[8], 10000000b  ; test msb
      jnz      bottom_shift_16
      dec      dx                      ; decrement exponent
      shl      word ptr value[0], 1    ; shift left the 9 byte number
      rcl      word ptr value[2], 1
      rcl      word ptr value[4], 1
      rcl      word ptr value[6], 1
      rcl      byte ptr value[8], 1    ; notice this last one is byte ptr
      jmp      top_shift_16
bottom_shift_16:

      ; round last byte
      ; value[0] is dropped by the fld below; if it is 80h or more,
      ; add one to the 8 byte mantissa in value[1..8]
      cmp      byte ptr value[0], 80h  ;
;      jb       bottom                  ; no rounding necessary
      jnb      not_bottom1
      jmp      bottom                  ; no rounding necessary
not_bottom1:
      add      word ptr value[1], 1
      adc      word ptr value[3], 0
      adc      word ptr value[5], 0
      adc      word ptr value[7], 0
;      jnc      bottom
      jc       not_bottom2
      jmp      bottom
not_bottom2:
      ; to get to here, the pattern was rounded from +FFFF...
      ; to +10000... with the 1 getting moved to the carry bit
ENDIF

IFDEF BIG16AND32
      ; skip over the 32 bit code to the shared fix-up
      jmp      rounded_past_end
ENDIF

IFDEF BIG32
use_32_bit:
.386
      ; same algorithm as the 16 bit code above, using dword operations
      ; clear value
      mov      dword ptr value[0], 0
      mov      dword ptr value[4], 0
      mov      word ptr value[8],  0
      mov      byte ptr value[10], 0

      ; copy bytes from n to value
      lea      di, value+9
      sub      di, cx               ; cx holds movebytes
      mov      dx, ss               ; move ss to es for movs
      mov      es, dx               ; ie: move ss:value+9-cx to es:di
      mov      bx, bflength
      dec      bx
      sub      bx, cx               ; cx holds movebytes
      mov      si, word ptr n
      mov      ds, bignum_seg       ; move n to ds:si for movs
      add      si, bx               ; n+bflength-1-movebytes
      rep movsb
      mov      bl, ds:[si]          ; save sign byte, si now points to it
      inc      si                   ; point to exponent
      mov      dx, ds:[si]          ; use dx as exponent
      shl      dx, 3                ; 256^n = 2^(8n)

      ; adjust for negative values
      and      bl, 10000000b           ; determine sign
      jz       not_neg_32
      ; 9 byte two's complement negate, see the 16 bit code for the
      ; neg/cmc/not/adc carry handling
      neg      dword ptr value[0]      ; take the negative of the 9 byte number
      cmc                              ; toggle carry flag
      not      dword ptr value[4]
      adc      dword ptr value[4], 0
      not      byte ptr value[8]       ; notice this last one is byte ptr
      adc      byte ptr value[8], 0
not_neg_32:

      cmp      byte ptr value[8], 0          ; test for 0
      jnz      top_shift_32
      fldz
      jmp      return

      ; Shift until most signifcant bit is set.
top_shift_32:
      test     byte ptr value[8], 10000000b  ; test msb
      jnz      bottom_shift_32
      dec      dx                      ; decrement exponent
      shl      dword ptr value[0], 1   ; shift left the 9 byte number
      rcl      dword ptr value[4], 1
      rcl      byte ptr value[8], 1    ; notice this last one is byte ptr
      jmp      top_shift_32
bottom_shift_32:

      ; round last byte
      cmp      byte ptr value[0], 80h  ;
      jb       bottom                  ; no rounding necessary
      add      dword ptr value[1], 1
      adc      dword ptr value[5], 0
      jnc      bottom

      ; to get to here, the pattern was rounded from +FFFF...
      ; to +10000... with the 1 getting moved to the carry bit
ENDIF

rounded_past_end:
.8086 ; used in 16 bit code as well
      ; the mantissa in value[1..8] wrapped to zero: put the carried out bit
      ; back as the msb and bump the exponent to compensate
      mov      byte ptr value[8], 10000000b
      inc      dx                      ; adjust the exponent

bottom:
      ; adjust exponent
      add      dx, 3FFFh+7             ; unbiased -> biased, + adjusted
      or       dh, bl                  ; set sign bit if set
      mov      word ptr value[9], dx

      ; unlike float and double, long double is returned on fpu stack
      fld      real10 ptr value[1]    ; load return value
return:
      mov      ds, ax                  ; restore ds
      ret

bftofloat   endp

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; bf_t floattobf(bf_t n, LDBL f);
; converts a 10 byte real to a bf number
;
floattobf   PROC USES di si, n:bf_t, f:REAL10
   LOCAL value[9]:BYTE   ; 9=8+1
; I figured out a way to do this with no local variables,
; but it's not worth the extra overhead.
;
; Layout of the local buffer as used below:
;   value[0]     extra low order byte, receives bits shifted out of the
;                mantissa while the exponent is aligned
;   value[1..8]  copy of the 64 bit mantissa of f
;
; Register usage:
;   cx = movebytes, number of bytes copied from value into n
;   dx = exponent of f, unbiased, base 2 until divided by 8 at the end
;   ax = caller's ds while ds is pointed at the stack segment for the copy

      ; start from an all zero n; the mantissa bytes and exponent are
      ; written over it below
      invoke   clear_bf, n

      ; check to see if f is 0
      cmp      byte ptr f[7], 0        ; f[7] can only be 0 if f is 0
;      jz       return                  ; if f is 0, bailout now
      jnz      over_return
      jmp      return                  ; if f is 0, bailout now
over_return:

      mov      cx, 9                   ; need up to 9 bytes
      cmp      bflength, 10            ; but no more than bflength-1
      jae      movebytes_set
      mov      cx, bflength            ; bflength is less than 10
      dec      cx                      ; movebytes = bflength-1, 1 byte padding
movebytes_set:

IFDEF BIG16AND32
      cmp     cpu, 386              ; check cpu
      jae     use_32_bit            ; use faster 32 bit code if possible
ENDIF

IFDEF BIG16
; 16 bit code
      ; copy bytes from f's mantissa to value
      mov      byte ptr value[0], 0    ; clear least sig byte
      mov      ax, word ptr f[0]
      mov      word ptr value[1], ax
      mov      ax, word ptr f[2]
      mov      word ptr value[3], ax
      mov      ax, word ptr f[4]
      mov      word ptr value[5], ax
      mov      ax, word ptr f[6]
      mov      word ptr value[7], ax

      ; get exponent in dx
      mov      dx, word ptr f[8]       ; location of exponent
      and      dx, 7FFFh               ; remove sign bit
      sub      dx, 3FFFh+7             ; biased -> unbiased, + adjust

      ; Shift down until exponent is a mult of 8 (2^8n=256n)
      ; each one bit shift right is compensated by incrementing the
      ; exponent, so at most 7 shifts are performed
top_shift_16:
      test     dx, 111b                ; expon mod 8
      jz       bottom
      inc      dx                      ; increment exponent
      shr      word ptr value[7], 1    ; shift right the 9 byte number
      rcr      word ptr value[5], 1
      rcr      word ptr value[3], 1
      rcr      word ptr value[1], 1
      rcr      byte ptr value[0], 1    ; notice this last one is byte ptr
      jmp      top_shift_16
ENDIF

IFDEF BIG32
use_32_bit:
.386
      ; same algorithm as the 16 bit code above, using dword operations
      ; copy bytes from f's mantissa to value
      mov      byte ptr value[0], 0    ; clear least sig byte
      mov      eax, dword ptr f[0]
      mov      dword ptr value[1], eax
      mov      eax, dword ptr f[4]
      mov      dword ptr value[5], eax

      ; get exponent in dx
      mov      dx, word ptr f[8]       ; location of exponent
      and      dx, 7FFFh               ; remove sign bit
      sub      dx, 3FFFh+7             ; biased -> unbiased, + adjust

      ; Shift down until exponent is a mult of 8 (2^8n=256n)
top_shift_32:
      test     dx, 111b                ; expon mod 8
      jz       bottom
      inc      dx                      ; increment exponent
      shr      dword ptr value[5], 1   ; shift right the 9 byte number
      rcr      dword ptr value[1], 1
      rcr      byte ptr value[0], 1    ; notice this last one is byte ptr
      jmp      top_shift_32
ENDIF

bottom:
.8086
      ; Don't bother rounding last byte as it would only make a difference
      ; when bflength < 9, and then only on the last bit.

      ; move data into place, from value to n
      ; source is the top movebytes bytes of value (value[9-cx .. 8]),
      ; destination is n[bflength-1-cx .. bflength-2]
      lea      si, value+9
      sub      si, cx               ; cx holds movebytes
      mov      ax, ds               ; save ds
      mov      bx, ss               ; copy ss to ds for movs
      mov      ds, bx               ; ds:si
      mov      di, word ptr n
      mov      es, bignum_seg       ; move n to es:di for movs
      add      di, bflength
      dec      di
      sub      di, cx               ; cx holds movebytes
      rep movsb
      ; di is now n+bflength-1; that byte is not written here, the
      ; exponent word is stored one byte further on
      inc      di
      mov      cl, 3
      sar      dx, cl               ; divide expon by 8, 256^n=2^8n
      mov      word ptr es:[di], dx ; store exponent
      mov      ds, ax               ; restore ds

      ; get sign
      ; n holds the magnitude at this point; negate it if f is negative
      test     byte ptr f[9], 10000000b           ; test sign bit
      jz       not_negative
      invoke   neg_a_bf, n
not_negative:
return:
      mov      ax, word ptr n
      ; NOTE(review): dx is loaded from the word that follows the first
      ; word of n on the stack; this is only part of the pointer if bf_t is
      ; wider than 16 bits - confirm against big.inc
      mov      dx, word ptr n+2        ; return r in dx:ax
      ret
floattobf   endp

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LDBL bntofloat(bn_t n);
; converts a bn number to a 10 byte real
; (the most speed critical of these to/from float routines)
bntofloat   PROC USES di si, n:bn_t
   LOCAL value[11]:BYTE   ; 11=10+1
;
; Builds an 80 bit real in value[1..10] and loads it onto the fpu stack.
; Layout of the local buffer as used below:
;   value[0]     extra low order byte (9th byte of precision while shifting)
;   value[1..8]  64 bit mantissa of the real10
;   value[9..10] sign bit + biased exponent of the real10
;
; Register usage once the significant bytes have been located:
;   bl = top byte of n, later masked down to its sign bit
;   cx = number of bytes copied from n into value (1..9)
;   dx = number of significant bytes, then the base 2 exponent
;   ax = scratch / flag for the "negated to exactly a power of two" case

      ; determine the most significant byte, not 0 or FF
      mov      si, word ptr n
      mov      es, bignum_seg
      dec      si
      add      si, bnlength            ; n+bnlength-1
      mov      bl, es:[si]             ; top byte
      mov      cx, bnlength            ; initialize cx with full bnlength
      cmp      bl, 0                   ; test top byte against 0
      je       determine_sig_bytes
      cmp      bl, 0FFh                ; test top byte against -1
      jne      sig_bytes_determined

determine_sig_bytes:
      ; top byte is pure sign extension (00 or FF): walk down until a byte
      ; that differs from it is found.  Invariant: cx = (si - n) + 1
      dec      cx                      ; now bnlength-1
top_sig_byte:
      dec      si                      ; previous byte
      cmp      es:[si], bl             ; does it have the right stuff?
      jne      sig_bytes_determined    ; (ie: does it match top byte?)
      loop     top_sig_byte            ; decrement cx and repeat

; At this point, it must be 0 with no sig figs at all
; or -1/(256^bnlength), one bit away from being zero.
      cmp      bl, 0                   ; was it zero?
      jnz      not_zero                ; no, it was a very small negative
                                       ; yes
      fldz                             ; return zero
      jmp      return
not_zero:
      ; every byte of n is FF: build -2^(8*(intlength-bnlength)) directly
      mov      ax, intlength
      sub      ax, bnlength
      mov      cl, 3
      shl      ax, cl                  ; 256^n=2^8n, now more like movebits
      add      ax, 3FFFh+0             ; bias, no adjustment necessary
      or       ah, 10000000b           ; turn on sign flag
      mov      word ptr value[9], ax   ; store exponent
      mov      word ptr value[7], 8000h ; store mantissa of 1 in most sig bit
      ; clear rest of value that is actually used
      mov      word ptr value[1], 0
      mov      word ptr value[3], 0
      mov      word ptr value[5], 0

      fld      real10 ptr value[1]
      jmp      return

sig_bytes_determined:
      ; cx = number of significant bytes = index of the byte at es:si, plus 1
      mov      dx, cx               ; save in dx for later
      ; Only cx-1 bytes of n lie below the byte si points to.  Clamping cx
      ; itself (rather than cx-1) to 8 would step si back to n-1 whenever
      ; there are 8 or fewer significant bytes, and the byte in front of n
      ; would be copied into the mantissa.
      dec      cx                   ; bytes available below the top sig byte
      cmp      cx, 9-1              ; no more than 8 of them are needed
      jb       set_movebytes
      mov      cx, 9-1              ; up to 8 bytes
set_movebytes:                      ; cx now holds movebytes
                                    ; si still points to most non-0 sig byte
      sub      si, cx               ; si now points to first byte to be moved
      inc      cx                   ; can be up to 9

IFDEF BIG16AND32
      cmp     cpu, 386              ; check cpu
;      jae     use_32_bit            ; use faster 32 bit code if possible
      jb      not_use_32_bit
      jmp     use_32_bit            ; use faster 32 bit code if possible
not_use_32_bit:
ENDIF

IFDEF BIG16
; 16 bit code
      ; clear value
      mov      word ptr value[0], 0
      mov      word ptr value[2], 0
      mov      word ptr value[4], 0
      mov      word ptr value[6], 0
      mov      word ptr value[8], 0
      mov      byte ptr value[10], 0

      ; copy bytes from n to value  ; es:si still holds first move byte of n
      ; destination is value[9-cx .. 8], right justified against value[8]
      lea      di, value+9
      sub      di, cx               ; cx holds movebytes
      mov      ax, ss               ; move ss to es
      mov      es, ax               ; value[9] is in es:di
      mov      ax, ds               ; save ds
      mov      ds, bignum_seg       ; first move byte of n is now in ds:si
      rep movsb
      mov      ds, ax               ; restore ds

      ; adjust for negative values
      xor      ax, ax                  ; use ax as a flag
      ; get sign flag                  ; top byte is still in bl
      and      bl, 10000000b           ; isolate the sign bit
      jz       not_neg_16
      ; two's complement negate across 9 bytes: neg leaves CF=0 only when
      ; the low word was zero, so cmc turns CF into the +1 carry that the
      ; following not/adc pairs propagate upward (not leaves CF untouched)
      neg      word ptr value[0]       ; take the negative of the 9 byte number
      cmc                              ; toggle carry flag
      not      word ptr value[2]
      adc      word ptr value[2], 0
      not      word ptr value[4]
      adc      word ptr value[4], 0
      not      word ptr value[6]
      adc      word ptr value[6], 0
      not      byte ptr value[8]       ; notice this last one is byte ptr
      adc      byte ptr value[8], 0
      jnc      not_neg_16              ; normal
      ; carry out of the top byte: all 9 copied bytes were zero, so the
      ; magnitude is a single 1 bit just above them
      mov      byte ptr value[8], 10000000b    ;n was FFFF...0000...
      inc      ax                      ; set ax to 1 to flag this special case

not_neg_16:
      sub      dx, bnlength            ; adjust exponent
      add      dx, intlength           ; adjust exponent
      mov      cl, 3
      shl      dx, cl                  ; 256^n=2^8n
      add      dx, ax                  ; see special case above
      ; Shift until most signifcant bit is set.
      ; each one bit shift left is compensated by decrementing the exponent
top_shift_16:
      test     byte ptr value[8], 10000000b  ; test msb
;      jnz      bottom
      jz       over_bottom
      jmp      bottom
over_bottom:
      dec      dx                      ; decrement exponent
      shl      word ptr value[0], 1    ; shift left the 9 byte number
      rcl      word ptr value[2], 1
      rcl      word ptr value[4], 1
      rcl      word ptr value[6], 1
      rcl      byte ptr value[8], 1    ; notice this last one is byte ptr
      jmp      top_shift_16

; don't bother rounding, not really needed while speed is.
ENDIF

IFDEF BIG32
use_32_bit:
.386
      ; same algorithm as the 16 bit code above, using dword operations
      ; clear value
      mov      dword ptr value[0], 0
      mov      dword ptr value[4], 0
      mov      word ptr value[8],  0
      mov      byte ptr value[10], 0

      ; copy bytes from n to value  ; es:si still holds first move byte of n
      lea      di, value+9
      sub      di, cx               ; cx holds movebytes
      mov      ax, ss               ; move ss to es
      mov      es, ax               ; value[9] is in es:di
      mov      ax, ds               ; save ds
      mov      ds, bignum_seg       ; first move byte of n is now in ds:si
      rep movsb
      mov      ds, ax               ; restore ds

      ; adjust for negative values
      xor      ax, ax                  ; use ax as a flag
      ; get sign flag                  ; top byte is still in bl
      and      bl, 10000000b           ; determine sign
      jz       not_neg_32
      neg      dword ptr value[0]      ; take the negative of the 9 byte number
      cmc                              ; toggle carry flag
      not      dword ptr value[4]
      adc      dword ptr value[4], 0
      not      byte ptr value[8]       ; notice this last one is byte ptr
      adc      byte ptr value[8], 0
      jnc      not_neg_32              ; normal
      mov      byte ptr value[8], 10000000b    ;n was FFFF...0000...
      inc      ax                      ; set ax to 1 to flag this special case

not_neg_32:
      sub      dx, bnlength            ; adjust exponent
      add      dx, intlength           ; adjust exponent
      shl      dx, 3                   ; 256^n=2^8n
      add      dx, ax                  ; see special case above
      ; Shift until most signifcant bit is set.
top_shift_32:
      test     byte ptr value[8], 10000000b  ; test msb
      jnz      bottom
      dec      dx                      ; decrement exponent
      shl      dword ptr value[0], 1   ; shift left the 9 byte number
      rcl      dword ptr value[4], 1
      rcl      byte ptr value[8], 1    ; notice this last one is byte ptr
      jmp      top_shift_32

; don't bother rounding, not really needed while speed is.
ENDIF

bottom:
.8086
      ; adjust exponent
      add      dx, 3FFFh+7-8           ; unbiased -> biased, + adjusted
      or       dh, bl                  ; set sign bit if set
      mov      word ptr value[9], dx

      ; unlike float and double, long double is returned on fpu stack
      fld      real10 ptr value[1]    ; load return value
return:
      ret

bntofloat   endp

;
; LDBL floattobn(bf_t n, LDBL f) is in BIGNUM.C
;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; These last two functions do not use bignum type numbers, but take
; long doubles as arguments.  These routines are called by the C code.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LDBL extract_256(LDBL f, int *exp_ptr)
;
; extracts the mantissa and exponent of f
; finds m and n such that 1<=|m|<256 and f = m*256^n
; n is stored in *exp_ptr and m is returned, sort of like frexp()

extract_256   PROC f:real10, exp_ptr: ptr sword
local  expon:sword, exf:real10, tmp_word:word
;
; Splits f into m * 256^n.  n is written to *exp_ptr, m is left on the
; fpu stack as the return value.  A zero f yields n = 0 and m = f.

        fld     f                   ; st(0) = f
        ftst                        ; compare st(0) against 0.0
        fstsw   tmp_word
        fwait
        mov     ax, tmp_word
        sahf                        ; fpu condition codes -> cpu flags
        jnz     ex_nonzero          ; not equal to zero, go split it

        ; f is zero: exponent is 0 and f itself is the return value
        mov     bx, exp_ptr
        mov     word ptr [bx], 0    ; *exp_ptr = 0
        ret                         ; f is still on the fpu stack

ex_nonzero:
; fxtract is not emulated by the MS floating point library, so without
; a real fpu the 80 bit format is taken apart by hand under ex_emulate
        cmp     fpu, 0
        je      ex_emulate

        ; --- hardware fpu path ---   stack: f
        fxtract                     ; mant exp,  f = mant*2^exp
        fxch                        ; exp mant
        fistp   expon               ; mant
        fwait
        mov     dx, expon           ; dx = exp, kept for the divide below
        mov     ax, dx

        ; ax = exp mod 8; the mask alone gives a result in 0..7 for
        ; negative two's complement values as well
        and     ax, 7
        jz      ex_store_exp        ; nothing to fold into the mantissa
        mov     expon, ax           ; reuse expon as a temporary
        fild    expon               ; (exp mod 8) mant
        fxch                        ; mant (exp mod 8)
        fscale                      ; mant*2^(exp mod 8)  (exp mod 8)
        fstp    st(1)               ; mant*2^(exp mod 8)

ex_store_exp:
        mov     cl, 3
        sar     dx, cl              ; exp / 8, arithmetic shift
        mov     bx, exp_ptr
        mov     [bx], dx            ; *exp_ptr = exp / 8

        fwait
        ret

ex_emulate:
        ; --- no fpu: edit the exponent word of the stored real10 ---
        fstp    exf                 ; pop f into memory

        mov     ax, word ptr exf+8  ; sign bit + biased exponent
        mov     dx, ax

        and     dx, 8000h           ; dx = sign bit only
        or      dx, 3FFFh           ; exponent field for 1<=|m|<2

        and     ax, 7FFFh           ; drop the sign bit
        sub     ax, 3FFFh           ; biased -> unbiased
        mov     bx, ax
        and     bx, 7               ; bx = exp mod 8, in 0..7
        add     dx, bx              ; scale the mantissa by 2^(exp mod 8)
        mov     word ptr exf+8, dx  ; write the new sign/exponent word back

        mov     cl, 3
        sar     ax, cl              ; exp / 8,  2^(8n) = 256^n
        mov     bx, exp_ptr
        mov     [bx], ax            ; *exp_ptr = exp / 8

        fld     exf                 ; return value
        ; unlike float and double, long double is returned on fpu stack
        ret
extract_256   ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LDBL scale_256( LDBL f, int n );
; calculates and returns the value of f*256^n
; sort of like ldexp()
;
; n must be in the range -2^12 <= n < 2^12 (2^12=4096),
; which should not be a problem

scale_256   PROC f:real10, n: sword
;
; Returns f * 256^n on the fpu stack.  n is multiplied by 8 in place
; (in this procedure's own stack copy) to turn it into a power of two
; for fscale.

        cmp     n, 0
        je      scale_by_zero       ; 256^0 = 1, just hand f back

        mov     cl, 3
        shl     n, cl               ; n = 8n
        fild    n                   ; 8n
        fld     f                   ; f 8n
; the fscale range limits for 8087/287 processors won't be a problem here
        fscale                      ; f*2^(8n) = f*256^n   8n
        fstp    st(1)               ; f*256^n  (drop the 8n underneath)
        ; unlike float and double, long double is returned on fpu stack
        ret

scale_by_zero:
        fld     f                   ; return f unchanged
        ret
scale_256   ENDP

END