export .when,.sqrt prec: equ 50 ; This code neither reads nor writes the FPSCR ; It assumes that FPSCR indicates "round to nearest". ; It does not "signal inexact". ; 60 clocks for prec <= 50 ; 68 clocks for 50 < prec <= 57 ; 102 clocks for 57 < prec .sqrt: stfd fp1,-8(SP) ; pun.f = t lwz r4,cx[TC](RTOC) ; r4 is base for constants. lis r7,0x3FF0 ; 0x3FF00000 lwz r6,-8(SP) ; pun.x.hi mfcr r3 ; save rlwinm r5,r6,32-11,23,27 ; w*16 oris r8,r6,0xFFF0 xoris r8,r8,0xC000 ; pun.x.hi with exp set to 3FF stw r8,-8(SP) ; pun.x.hi rlwinm. r8,r6,12,21,31 ; exp add r5,r5,r4 ; -> vx element lfs fp4,4(r5) ; vx[w].pre bc 12,2,smallarg ; asul add r7,r6,r7 ; pun.x.hi + 0x3FF00000 mtcrf 0xFF,r6 ; oe to CR1 lfd fp1,-8(SP) lfs fp3,c0-tz(r4) ; 1. rlwinm r7,r7,32-1,1,11 ; expon adjust for ans. ; if pedantic ; mffs fp5 ; mtfsfi 7,0 ; guard against inexact interrupts ; ; and ensure round to nearest ; stfd fp5,-40(SP) ; preserve caller's state ; endif stw r7,-8(SP) fmsub fp4,fp4,fp1,fp3 ; x lfs fp5,c8-tz(r4) ; 429./32768 [x8] lfs fp6,c6-tz(r4) ; 21./1024 [x6] li r7,0 fmul fp2,fp4,fp4 ; x2 lfs fp8,c7-tz(r4) ; 33./2048 [x7] lfs fp9,c5-tz(r4) ; 7./256 [x5] fmadds fp7,fp5,fp2,fp6 lfs fp12,c4-tz(r4) ; 5./128 [x4] bc 12,0,NegArg ; NB. CR1 still in use fmadds fp10,fp8,fp2,fp9 lfs fp13,c3-tz(r4) ; 1./16 [x3] fmadds fp7,fp7,fp2,fp12 lfs fp5,c2-tz(r4) ; 1./8 [x2] stw r7,-4(SP) ; formed double power of two fmadd fp10,fp10,fp2,fp13 lfs fp12,c1-tz(r4) ; 1./2 [x1] cmpi 0,0,r8,0x7ff ; Infinity exponent? fmadd fp7,fp7,fp2,fp5 fmadd fp10,fp10,fp2,fp12 bc 12,2,WayTooBig fnmsub fp5,fp7,fp2,fp3 ; even terms lfd fp13,8(r5) ; vx[w].post lfd fp7,sQrt2-tz(r4) fmadd fp0,fp4,fp10,fp5 ; whole series lfs fp12,0(r5) ; adj fmul fp0,fp13,fp0 ; r bc 12,11,OddExponent fadd fp1,fp1,fp1 ; holds pun.f value fmul fp0,fp0,fp7 lfd fp7,-8(SP) join: if prec>50 fmsub fp13,fp0,fp0,fp1 ; (r*r-pun.f) if prec>57 fnmsub fp12,fp13,fp12,fp0 ; holds adjusted root, R lfs fp4,tm52-tz(r4) fmsub fp3,fp12,fp12,fp1 ; R*R - pun.f ; abs(root(pun.f) - R) < .5152 * 2^-52 fsubs fp0,fp1,fp1 ; produce 0 as comparand! fmadd fp13,fp4,fp12,fp3 ; (R+2^-53)^2 - pun.f - 2^-106 fnmsub fp5,fp4,fp12,fp3 ; (R-2^-53)^2 - pun.f - 2^-106 fcmpu 1,fp13,fp0 fcmpu 5,fp5,fp0 bc 12,4+0,TooSmall bc 4,20+0,TooBig Safe: else fnmsub fp12,fp13,fp12,fp0 ; holds adjusted root, R ; abs(root(pun.f) - R) < .5152 * 2^-52 endif endif ; with adjustment defeated the average absolute error is ; 0.6 of last bit while worst error is 3.5 times last bit. ; with adjustment average error is 0.25004 times last bit ; and worst case error is .515 times last bit. ; (.500 is best theoretical possible.) ; rms errors are respectivesy .8 & .3 mtcrf 0xFF,r3 ; restore if prec>50 fmul fp1,fp7,fp12 else fmul fp1,fp7,fp0 endif blr OddExponent: fmul fp12,fp7,fp12 ; adj lfd fp7,-8(SP) b join if prec>57 TooBig: fsub fp12,fp12,fp4 b Safe TooSmall: fadd fp12,fp12,fp4 b Safe endif NegArg: lfd fp1,nan1-tz(r4) out: mtcrf 0xFF,r3 ; restore blr resZero: addic. r7,r7,0 ; test pun.x.lo bc 12,2,out ; return for .sqrt(0) b DeNormal smallarg: ; Exponent of arg is 0. lwz r7,-4(SP) ; pun.x.lo rlwinm. r8,r6,0,1,31 ; test pun.x.hi bc 12,2,resZero DeNormal: subi SP,SP,8 mflr r5 stw r5,0(SP) ; Preserve original caller of .sqrt lfs fp5,t60-tz(r4) fmul fp1,fp1,fp5 bl .sqrt lfs fp5,tm30-tz(r4) lwz r5,0(SP) ; Retrieve original caller mtlr r5 fmul fp1,fp1,fp5 addi SP,SP,8 blr WayTooBig: stw r6,-8(SP) ; put it back like it was lfd fp1,-8(SP) ; reconstructed original argument b out .when: mfspr r4,RTCU stw r4,0(r3) mfspr r4,RTCL stw r4,4(r3) mfcr r3 blr toc tc cx[TC],tz csect tx[RO] tz: dc.l 0x3EB39F19,0x3F7C0FC4,0x3FF01FE0,0x26B0EBA8 dc.l 0x3EB0EB96,0x3F74898F,0x3FF05EE6,0x810A0447 dc.l 0x3EAE565C,0x3F6D7307,0x3FF09CFD,0xB018663D dc.l 0x3EABDD46,0x3F66C2B5,0x3FF0DA30,0x46DF0BCB dc.l 0x3EA97E62,0x3F607038,0x3FF11687,0xA9BF7D1E dc.l 0x3EA737F0,0x3F5A7410,0x3FF1520C,0xB9665E7C dc.l 0x3EA50855,0x3F54C77B,0x3FF18CC8,0x21F9ED73 dc.l 0x3EA2EE1D,0x3F4F6475,0x3FF1C6C1,0x676DCBA5 dc.l 0x3EA0E7F5,0x3F4A4588,0x3FF1FFFF,0xFEE00000 dc.l 0x3E9EF4A4,0x3F4565CB,0x3FF2388A,0xA24549E3 dc.l 0x3E9D130E,0x3F40C0C6,0x3FF27067,0xE1508515 dc.l 0x3E9B422C,0x3F3C5265,0x3FF2A79E,0x2E142258 dc.l 0x3E99810C,0x3F381706,0x3FF2DE32,0x9D67F180 dc.l 0x3E97CED0,0x3F340B43,0x3FF3142B,0x11823B72 dc.l 0x3E962AA9,0x3F302C0C,0x3FF3498C,0x89D42842 dc.l 0x3E9493D9,0x3F2C7692,0x3FF37E5B,0xCD0E2CA5 dc.l 0x3E9309AF,0x3F28E842,0x3FF3B29D,0x55AF516A dc.l 0x3E918B87,0x3F257EB5,0x3FF3E655,0xEF25E013 dc.l 0x3E9018C6,0x3F2237C3,0x3FF41989,0x4ECE8E2D dc.l 0x3E8EB0E0,0x3F1F1166,0x3FF44C3B,0x824F7CAD dc.l 0x3E8D534F,0x3F1C09C5,0x3FF47E70,0xADFAA87 dc.l 0x3E8BFF97,0x3F191F1A,0x3FF4B02B,0x54FAD60B dc.l 0x3E8AB544,0x3F164FDE,0x3FF4E16F,0x9B31198B dc.l 0x3E8973E8,0x3F139A86,0x3FF51241,0x312541A3 dc.l 0x3E883B1E,0x3F10FDBD,0x3FF542A2,0x66B811F2 dc.l 0x3E870A87,0x3F0E7835,0x3FF57296,0xA3C13A85 dc.l 0x3E85E1C7,0x3F0C08C4,0x3FF5A220,0x2F014E4E dc.l 0x3E84C08B,0x3F09AE41,0x3FF5D142,0xAD7BC321 dc.l 0x3E83A682,0x3F0767AC,0x3FF5FFFF,0xF2F0000C dc.l 0x3E829362,0x3F053408,0x3FF62E5A,0xD3FABB21 dc.l 0x3E8186E2,0x3F03126F,0x3FF65C55,0x799527C5 dc.l 0x3E8080C1,0x3F010204,0x3FF689F2,0x6D1F5164 sqrt2: dc.l 0x3FF6A09E,0x667F3BCD nan1: dc.l 0x7FF80020,0 c0: dc.l 0x3F800000 ; 1.0000000000000e+00 c1: dc.l 0x3F000000 ; 5.0000000000000e-01 c2: dc.l 0x3E000000 ; 1.2500000000000e-01 c3: dc.l 0x3D800000 ; 6.2500000000000e-02 c4: dc.l 0x3D200000 ; 3.9062500000000e-02 c5: dc.l 0x3CE00000 ; 2.7343750000000e-02 c6: dc.l 0x3CA80000 ; 2.0507812500000e-02 c7: dc.l 0x3C840000 ; 1.6113281250000e-02 c8: dc.l 0x3C568000 ; 1.3092041015625e-02 t60: dc.l 0x5D800000 tm30: dc.l 0x30800000 tm52: dc.l 0x25800000