32x32 bit multiply

The ICC12 library's long multiply does not supply a 64-bit result. This routine is an extension of the existing 32x32 library function that produces the full 64-bit product; it requires the user to define a uint64 type. On a 9S12 running at 24 MHz the function executes in 5.5 usec. Typical usage:

/* 64-bit result container for mult64().  The routine writes the most
   significant 16-bit word of the product at byte offset 0 and the least
   significant at byte offset 6, so the members below are listed from high
   to low.  Assumes unsigned long is 32 bits and unsigned short is 16 bits
   on the target compiler -- confirm for your toolchain. */
typedef union { 
  struct {                 /* product viewed as two 32-bit halves */
    unsigned long h;       /* upper half (byte offsets 0-3) */
    unsigned long l;       /* lower half (byte offsets 4-7) */
  } l; 
  struct {                 /* product viewed as four 16-bit words */
    unsigned short hh;     /* byte offset 0 */
    unsigned short hl;     /* byte offset 2 */
    unsigned short lh;     /* byte offset 4 */
    unsigned short ll;     /* byte offset 6 */
  } w; 
} uint64; 
unsigned long n1 = 0x76543210; 
unsigned long n2 = 0xFEDCBA98; 
uint64 n3; 
/* All three arguments are passed by address; the product is stored in n3. */
mult64(&n1, &n2, &n3); /* Result in n3 is 0x75CD9046541D5980 (n3.l.h = 0x75CD9046, n3.l.l = 0x541D5980) */ 

 

;-----------------------------------------------------------------------
; _mult64 -- unsigned 32 x 32 -> 64 bit multiply:  *n3 = *n1 * *n2
;
; Called from C as mult64(&n1, &n2, &n3).
;
; Arguments on entry (ICC12 V7 layout as recorded by the author):
;   D     = n1, address of the first 32-bit operand
;   2,sp  = n2, address of the second 32-bit operand
;   4,sp  = n3, address of the 8-byte result
;
; Operands and result are treated as most-significant-word-first:
;   n1 = n1h:n1l    n2 = n2h:n2l    n3 = i:j:k:l  (words at 0,2,4,6)
;
; Method (four 16x16 EMUL partial products):
;   k:l    = n1l * n2l
;   i:j    = n1h * n2h
;   i:j:k += n1h * n2l
;   i:j:k += n1l * n2h
;
; Both operands are copied to the stack before n3 is first written, so
; n3 may overlap n1 or n2.
;
; Stack frame once the prologue has run (offsets from SP):
;    0,s n2h     2,s n2l     4,s n1h     6,s n1l
;    8,s saved n1 pointer   10,s return address
;   12,s n2 pointer         14,s n3 pointer
;
; D, X and Y are overwritten and not restored.
; NOTE(review): confirm that the compiler's calling convention allows a
; callee to clobber X and Y.
;
; [n] on each line is the author's bus-cycle figure for that instruction.
;-----------------------------------------------------------------------
_mult64::
        pshd                    ; [2] keep the n1 pointer (passed in D) on the stack
        ldx     0,s             ; [3] X = n1
        movw    2,x,2,-s        ; [5] push n1l
        movw    0,x,2,-s        ; [5] push n1h
        ldx     8,s             ; [3] X = n2 (2,sp on entry + 6 bytes pushed)
        movw    2,x,2,-s        ; [5] push n2l
        movw    0,x,2,-s        ; [5] push n2h
        ldx     14,s            ; [3] X = n3 (4,sp on entry + 10 bytes pushed)

; ---- k:l = n1l * n2l ------------------------------------------------
        ldd     6,s             ; [3] D = n1l
        ldy     2,s             ; [3] Y = n2l
        emul                    ; [3] Y:D = D * Y
        sty     4,x             ; [2] k = high word of product
        std     6,x             ; [2] l = low word of product

; ---- i:j = n1h * n2h ------------------------------------------------
        ldd     4,s             ; [3] D = n1h
        ldy     0,s             ; [3] Y = n2h
        emul                    ; [3] Y:D = D * Y
        sty     0,x             ; [2] i = high word of product
        std     2,x             ; [2] j = low word of product

; ---- i:j:k += n1h * n2l ---------------------------------------------
; The carry flag is threaded from k through j into i; the stores, the
; exchange and the load in between are relied on to leave it intact.
        ldd     4,s             ; [3] D = n1h
        ldy     2,s             ; [3] Y = n2l
        emul                    ; [3] Y:D = D * Y
        addd    4,x             ; [3] low word of product + k
        std     4,x             ; [2] k updated
        exg     Y,D             ; [1] bring the high word of the product into D
        adcb    3,x             ; [3] + low byte of j + carry out of k
        adca    2,x             ; [3] + high byte of j + carry
        std     2,x             ; [2] j updated
        ldd     0,x             ; [3] D = i
        adcb    #0              ; [3] ripple the carry out of j into i ...
        adca    #0              ; [3] ... across both bytes
        std     0,x             ; [2] i updated

; ---- i:j:k += n1l * n2h ---------------------------------------------
; Same carry-propagating accumulate as the previous block.
        ldd     6,s             ; [3] D = n1l
        ldy     0,s             ; [3] Y = n2h
        emul                    ; [3] Y:D = D * Y
        addd    4,x             ; [3] low word of product + k
        std     4,x             ; [2] k updated
        exg     Y,D             ; [1] bring the high word of the product into D
        adcb    3,x             ; [3] + low byte of j + carry out of k
        adca    2,x             ; [3] + high byte of j + carry
        std     2,x             ; [2] j updated
        ldd     0,x             ; [3] D = i
        adcb    #0              ; [3] ripple the carry out of j into i ...
        adca    #0              ; [3] ... across both bytes
        std     0,x             ; [2] i updated

        leas    10,s            ; [2] discard 8 bytes of operand copies + saved n1 pointer
        rts                     ; [5]
; Author's total: 132 cycles.
; NOTE(review): the tally counts each immediate-mode "adcb #0"/"adca #0"
; as 3 cycles; check against the CPU12 instruction tables, where the
; immediate forms are expected to be shorter than the indexed ones.