The ICC12 library doesn't supply a 64-bit result for its long multiply. This routine is an extension of the existing 32x32 library function and requires the user to define a uint64 type. For a 9S12 running at 24 MHz, the function executes in 5.5 µs (132 cycles).
Typical Usage:
/*
 * 64-bit container used for the result of mult64().
 *
 * The same storage can be viewed two ways:
 *   .l  - two unsigned long halves   (h = high half, l = low half)
 *   .w  - four unsigned short words  (hh, hl, lh, ll, declared high to low)
 *
 * Members are declared most-significant first.
 * NOTE(review): the two views only line up as a 64-bit value when
 * unsigned long is 32 bits and unsigned short is 16 bits on the target;
 * confirm for the compiler in use.
 */
typedef union {
    struct {
        unsigned long h, l;
    } l;
    struct {
        unsigned short hh, hl, lh, ll;
    } w;
} uint64;
/* Example operands: the two unsigned long factors to multiply. */
unsigned long n1 = 0x76543210;
unsigned long n2 = 0xFEDCBA98;
/* Receives the product; mult64 writes all four words of it. */
uint64 n3;
/* All three arguments are passed by address: factors first, result last. */
mult64(&n1, &n2, &n3); /* Result in n3 is 0x75CD9046541D5980, i.e. n3.l.h = 0x75CD9046 and n3.l.l = 0x541D5980 */
| _mult64:: | ||
| ; Unsigned 32 x 32 -> 64 bit multiply, called from C as mult64(&n1, &n2, &n3). | ||
| ; ICC12V7 stack on entry (return address sits at 0,sp): | ||
| ; n3 -> 4,sp (pointer to the 8-byte result) | ||
| ; n2 -> 2,sp (pointer to the second operand) | ||
| ; n1 -> D (pointer to the first operand) | ||
| ; Notation: n1=A:B n2=C:D n3=i:j:k:l, each letter is one 16-bit word, most significant first. | ||
| ; In this notation "D" is the low word of n2, not the D accumulator. | ||
| ; Product = (A*C << 32) + ((A*D + B*C) << 16) + B*D, built from four 16x16 emul operations. | ||
| ; Both operands are copied to the stack before n3 is first written. | ||
| ; D, X and Y are overwritten and not restored. | ||
| ; NOTE(review): confirm the ICC12 calling convention allows an assembly routine to clobber X and Y. | ||
| pshd | ;[2] push pointer to n1 onto stack (icc12 passes 1st arg via D) | |
| ldx | 0,s | ;[3] X = address of n1 (the value just pushed) |
| movw | 2,x,2,-s | ;[5] push lower word of n1 (B) onto stack |
| movw | 0,x,2,-s | ;[5] push upper word of n1 (A) onto stack |
| ldx | 8,s | ;[3] X = address of n2 (entry offset 2 plus the 6 bytes pushed so far) |
| movw | 2,x,2,-s | ;[5] push lower word of n2 (D) onto stack |
| movw | 0,x,2,-s | ;[5] push upper word of n2 (C) onto stack |
| ; Stack now: 0,s=C 2,s=D 4,s=A 6,s=B 8,s=saved n1 pointer 10,s=return address 12,s=n2 pointer 14,s=n3 pointer | ||
| ldx | 14,s | ;[3] X = address of n3; X stays the result pointer from here on |
| ; --- partial product B * D -> k:l | ||
| ldd | 6,s | ;[3] load lower word of n1 (B) into D |
| ldy | 2,s | ;[3] load lower word of n2 (D) into Y |
| emul | ;[3] B * D, 32-bit product returned in Y:D | |
| sty | 4,x | ;[2] store upper word of result to n3 k word |
| std | 6,x | ;[2] store lower word of result to n3 l word (final value, nothing else is added to l) |
| ; --- partial product A * C -> i:j | ||
| ldd | 4,s | ;[3] load upper word of n1 (A) into D |
| ldy | 0,s | ;[3] load upper word of n2 (C) into Y |
| emul | ;[3] A * C, 32-bit product returned in Y:D | |
| sty | 0,x | ;[2] store upper word of result to n3 i word |
| std | 2,x | ;[2] store lower word of result to n3 j word |
| ; --- cross product A * D, added into j:k with carry rippled into i | ||
| ldd | 4,s | ;[3] load upper word of n1 (A) into D |
| ldy | 2,s | ;[3] load lower word of n2 (D) into Y |
| emul | ;[3] A * D, 32-bit product returned in Y:D | |
| addd | 4,x | ;[3] add lower word of result to n3 k word; carry out feeds the adcb below |
| std | 4,x | ;[2] store sum to n3 k word (the carry from addd must survive this store and the exg) |
| exg | Y,D | ;[1] put Y (upper word of result) into D |
| adcb | 3,x | ;[3] add with carry lower byte of n3 j word (16-bit add done a byte at a time, low byte first) |
| adca | 2,x | ;[3] add with carry upper byte of n3 j word |
| std | 2,x | ;[2] store sum to n3 j word |
| ldd | 0,x | ;[3] load n3 i word into D (the carry from adca must survive this load) |
| adcb | #0 | ;[3] add carry into lower byte of i |
| adca | #0 | ;[3] ripple carry into upper byte of i |
| std | 0,x | ;[2] store updated n3 i word |
| ; --- cross product B * C, added into j:k with carry rippled into i (same sequence as above) | ||
| ldd | 6,s | ;[3] load lower word of n1 (B) into D |
| ldy | 0,s | ;[3] load upper word of n2 (C) into Y |
| emul | ;[3] B * C, 32-bit product returned in Y:D | |
| addd | 4,x | ;[3] add lower word of result to n3 k word; carry out feeds the adcb below |
| std | 4,x | ;[2] store sum to n3 k word |
| exg | Y,D | ;[1] put Y (upper word of result) into D |
| adcb | 3,x | ;[3] add with carry lower byte of n3 j word |
| adca | 2,x | ;[3] add with carry upper byte of n3 j word |
| std | 2,x | ;[2] store sum to n3 j word |
| ldd | 0,x | ;[3] load n3 i word into D |
| adcb | #0 | ;[3] add carry into lower byte of i |
| adca | #0 | ;[3] ripple carry into upper byte of i |
| std | 0,x | ;[2] store updated n3 i word |
| leas | 10,s | ;[2] drop the 10 bytes pushed above (n1 pointer plus the two 4-byte operand copies) |
| rts | ;[5] return; no value is loaded into D for the caller | |
| ;[132 cycles] = sum of the bracketed per-instruction counts above | ||
| ; NOTE(review): immediate-mode adcb/adca (#0) appear to be 1 cycle each in the CPU12 reference, which would make the true total lower than 132; confirm. | ||