Volume 1 Basic Architecture (794100), страница 94
Текст из файла (страница 94)
1 E-25GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERSswitch (exc_env->rounding_mode) {case ROUND_TO_NEAREST:cw = 0x003f; // round to nearest, single precision, exceptions maskedbreak;case ROUND_DOWN:cw = 0x043f; // round down, single precision, exceptions maskedbreak;case ROUND_UP:cw = 0x083f; // round up, single precision, exceptions maskedbreak;case ROUND_TO_ZERO:cw = 0x0c3f; // round to zero, single precision, exceptions maskedbreak;default:;}__asm {fldcw WORD PTR cw;}// compute result and round to the destination precision, with// "unbounded" exponent (first IEEE rounding)switch (exc_env->operation) {case ADDPS:case ADDSS:// perform the addition__asm {fnclex;// load input operandsfld DWORD PTR uiopd1; // may set denormal or invalid status flagsfld DWORD PTR uiopd2; // may set denormal or invalid status flagsfaddp st(1), st(0); // may set inexact or invalid status flags// store resultfstp QWORD PTR dbl_res24; // exact}break;case SUBPS:case SUBSS:// perform the subtraction__asm {fnclex;// load input operandsfld DWORD PTR uiopd1; // may set denormal or invalid status flagsfld DWORD PTR uiopd2; // may set denormal or invalid status flagsfsubp st(1), st(0); // may set the inexact or invalid status flags// store resultfstp QWORD PTR dbl_res24; // exact}break;E-26 Vol.
1GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERScase MULPS:case MULSS:// perform the multiplication__asm {fnclex;// load input operandsfld DWORD PTR uiopd1; // may set denormal or invalid status flagsfld DWORD PTR uiopd2; // may set denormal or invalid status flagsfmulp st(1), st(0); // may set inexact or invalid status flags// store resultfstp QWORD PTR dbl_res24; // exact}break;case DIVPS:case DIVSS:// perform the division__asm {fnclex;// load input operandsfld DWORD PTR uiopd1; // may set denormal or invalid status flagsfld DWORD PTR uiopd2; // may set denormal or invalid status flagsfdivp st(1), st(0); // may set the inexact, divide by zero, or// invalid status flags// store resultfstp QWORD PTR dbl_res24; // exact}break;default:; // will never occur}// read status word__asm {fstsw WORD PTR sw;}if (sw & ZERODIVIDE_MASK)sw = sw & ~DENORMAL_MASK; // clear D flag for (denormal / 0)// if invalid flag is set, and invalid exceptions are enabled, take trapif (!(exc_env->exc_masks & INVALID_MASK) && (sw & INVALID_MASK)) {exc_env->status_flag_invalid_operation = 1;exc_env->exception_cause = INVALID_OPERATION;return (RAISE_EXCEPTION);}// checking for NaN operands has priority over denormal exceptions;Vol.
1 E-27
GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS
// also fix for the SSE and SSE2
// differences in treating two NaN inputs between the
// instructions and other IA-32 instructions
if (isnanf (uiopd1) || isnanf (uiopd2)) {
  if (isnanf (uiopd1) && isnanf (uiopd2))
    exc_env->result_fval = quietf (uiopd1);
  else
    exc_env->result_fval = (float)dbl_res24; // exact
  if (sw & INVALID_MASK) exc_env->status_flag_invalid_operation = 1;
  return (DO_NOT_RAISE_EXCEPTION);
}
// if denormal flag set, and denormal exceptions are enabled, take trap
if (!(exc_env->exc_masks & DENORMAL_MASK) && (sw & DENORMAL_MASK)) {
  exc_env->status_flag_denormal_operand = 1;
  exc_env->exception_cause = DENORMAL_OPERAND;
  return (RAISE_EXCEPTION);
}
// if divide by zero flag set, and divide by zero exceptions are
// enabled, take trap (for divide only)
if (!(exc_env->exc_masks & ZERODIVIDE_MASK) && (sw & ZERODIVIDE_MASK)) {
  exc_env->status_flag_divide_by_zero = 1;
  exc_env->exception_cause = DIVIDE_BY_ZERO;
  return (RAISE_EXCEPTION);
}
// done if the result is a NaN (QNaN Indefinite)
res = (float)dbl_res24;
if (isnanf (*(unsigned int *)&res)) {
  exc_env->result_fval = res; // exact
  exc_env->status_flag_invalid_operation = 1;
  return (DO_NOT_RAISE_EXCEPTION);
}
// dbl_res24 is not a NaN at this point
if (sw & DENORMAL_MASK) exc_env->status_flag_denormal_operand = 1;
// Note: (dbl_res24 == 0.0 && sw & PRECISION_MASK) cannot occur
if (-MIN_SINGLE_NORMAL < dbl_res24 && dbl_res24 < 0.0 ||
    0.0 < dbl_res24 && dbl_res24 < MIN_SINGLE_NORMAL) {
  result_tiny = 1;
}
// check if the result is huge
if (NEGINFF < dbl_res24 && dbl_res24 < -MAX_SINGLE_NORMAL ||
    MAX_SINGLE_NORMAL < dbl_res24 && dbl_res24 < POSINFF) {
  result_huge = 1;
}
E-28 Vol.
1
GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS
// at this point, there are no enabled I, D, or Z exceptions to take; the
// instr. might lead to an enabled underflow, enabled underflow and inexact,
// enabled overflow, enabled overflow and inexact, enabled inexact, or
// none of these; if there are no U or O enabled exceptions, re-execute
// the instruction using IA-32 double precision format, and the
// user's rounding mode; exceptions must have been disabled before calling
// this function; an inexact exception may be reported on the 53-bit
// fsubp, fmulp, or on both the 53-bit and 24-bit conversions, while an
// overflow or underflow (with traps disabled) may be reported on the
// conversion from dbl_res to res

// check whether there is an underflow, overflow,
// or inexact trap to be taken
// if the underflow traps are enabled and the result is
// tiny, take underflow trap
if (!(exc_env->exc_masks & UNDERFLOW_MASK) && result_tiny) {
  dbl_res24 = TWO_TO_192 * dbl_res24; // exact
  exc_env->status_flag_underflow = 1;
  exc_env->exception_cause = UNDERFLOW;
  exc_env->result_fval = (float)dbl_res24; // exact
  if (sw & PRECISION_MASK) exc_env->status_flag_inexact = 1;
  return (RAISE_EXCEPTION);
}
// if overflow traps are enabled and the result is huge, take
// overflow trap
if (!(exc_env->exc_masks & OVERFLOW_MASK) && result_huge) {
  dbl_res24 = TWO_TO_M192 * dbl_res24; // exact
  exc_env->status_flag_overflow = 1;
  exc_env->exception_cause = OVERFLOW;
  exc_env->result_fval = (float)dbl_res24; // exact
  if (sw & PRECISION_MASK) exc_env->status_flag_inexact = 1;
  return (RAISE_EXCEPTION);
}
// set control word with rounding mode set to exc_env->rounding_mode,
// double precision, and all exceptions disabled
cw = cw | 0x0200; // set precision to double
__asm {
  fldcw WORD PTR cw;
}
switch (exc_env->operation) {
case ADDPS:
case ADDSS:
  // perform the addition
  __asm {
Vol.
1 E-29
GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS
    // load input operands
    fld DWORD PTR uiopd1; // may set the denormal status flag
    fld DWORD PTR uiopd2; // may set the denormal status flag
    faddp st(1), st(0); // rounded to 53 bits, may set the inexact
                        // status flag
    // store result
    fstp QWORD PTR dbl_res; // exact, will not set any flag
  }
  break;
case SUBPS:
case SUBSS:
  // perform the subtraction
  __asm {
    // load input operands
    fld DWORD PTR uiopd1; // may set the denormal status flag
    fld DWORD PTR uiopd2; // may set the denormal status flag
    fsubp st(1), st(0); // rounded to 53 bits, may set the inexact
                        // status flag
    // store result
    fstp QWORD PTR dbl_res; // exact, will not set any flag
  }
  break;
case MULPS:
case MULSS:
  // perform the multiplication
  __asm {
    // load input operands
    fld DWORD PTR uiopd1; // may set the denormal status flag
    fld DWORD PTR uiopd2; // may set the denormal status flag
    fmulp st(1), st(0); // rounded to 53 bits, exact
    // store result
    fstp QWORD PTR dbl_res; // exact, will not set any flag
  }
  break;
case DIVPS:
case DIVSS:
  // perform the division
  __asm {
    // load input operands
    fld DWORD PTR uiopd1; // may set the denormal status flag
    fld DWORD PTR uiopd2; // may set the denormal status flag
    fdivp st(1), st(0); // rounded to 53 bits, may set the inexact
                        // status flag
    // store result
    fstp QWORD PTR dbl_res; // exact, will not set any flag
  }
  break;
default:
E-30 Vol.
1
GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS
  ; // will never occur
}
// calculate result for the case an inexact trap has to be taken, or
// when no trap occurs (second IEEE rounding)
res = (float)dbl_res;
// may set P, U or O; may also involve denormalizing the result
// read status word
__asm {
  fstsw WORD PTR sw;
}
// if inexact traps are enabled and result is inexact, take inexact trap
if (!(exc_env->exc_masks & PRECISION_MASK) &&
    ((sw & PRECISION_MASK) || (exc_env->ftz && result_tiny))) {
  exc_env->status_flag_inexact = 1;
  exc_env->exception_cause = INEXACT;
  if (result_tiny) {
    exc_env->status_flag_underflow = 1;
    // if ftz = 1 and result is tiny, result = 0.0
    // (no need to check for underflow traps disabled: result tiny and
    // underflow traps enabled would have caused taking an underflow
    // trap above)
    if (exc_env->ftz) {
      if (res > 0.0)
        res = ZEROF;
      else if (res < 0.0)
        res = NZEROF;
      // else leave res unchanged
    }
  }
  if (result_huge) exc_env->status_flag_overflow = 1;
  exc_env->result_fval = res;
  return (RAISE_EXCEPTION);
}
// if it got here, then there is no trap to be taken; the following must
// hold: ((the MXCSR U exceptions are disabled or
// the MXCSR underflow exceptions are enabled and the underflow flag is
// clear and (the inexact flag is set or the inexact flag is clear and
// the 24-bit result with unbounded exponent is not tiny)))
// and (the MXCSR overflow traps are disabled or the overflow flag is
// clear) and (the MXCSR inexact traps are disabled or the inexact flag
// is clear)
// in this case, the result has to be delivered (the status flags are
// sticky, so they are all set correctly already)
Vol.
1 E-31GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS// read status word to see if result is inexact__asm {fstsw WORD PTR sw;}if (sw & UNDERFLOW_MASK) exc_env->status_flag_underflow = 1;if (sw & OVERFLOW_MASK) exc_env->status_flag_overflow = 1;if (sw & PRECISION_MASK) exc_env->status_flag_inexact = 1;// if ftz = 1, and result is tiny (underflow traps must be disabled),// result = 0.0if (exc_env->ftz && result_tiny) {if (res > 0.0)res = ZEROF;else if (res < 0.0)res = NZEROF;// else leave res unchangedexc_env->status_flag_inexact = 1;exc_env->status_flag_underflow = 1;}exc_env->result_fval = res;if (sw & ZERODIVIDE_MASK) exc_env->status_flag_divide_by_zero = 1;if (sw & DENORMAL_MASK) exc_env->status_flag_denormal= 1;if (sw & INVALID_MASK) exc_env->status_flag_invalid_operation = 1;return (DO_NOT_RAISE_EXCEPTION);break;case CMPPS:case CMPSS:...break;case COMISS:case UCOMISS:...break;case CVTPI2PS:case CVTSI2SS:...break;case CVTPS2PI:E-32 Vol.
1
GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS
case CVTSS2SI:
case CVTTPS2PI:
case CVTTSS2SI:
  ...
  break;
case MAXPS:
case MAXSS:
case MINPS:
case MINSS:
  ...
  break;
case SQRTPS:
case SQRTSS:
  ...
  break;
...
case UNSPEC:
  ...
  break;
default:
  ...
}
}
Vol. 1 E-33
GUIDELINES FOR WRITING SIMD FLOATING-POINT EXCEPTION HANDLERS
E-34 Vol. 1
INDEX
Numerics
128-bit
  packed byte integers data type, 4-11, 11-5
  packed double-precision floating-point data type, 4-11, 11-5
  packed doubleword integers data type, 4-11
  packed quadword integers data type, 4-11
  packed SIMD data types, 4-10
  packed single-precision floating-point data type, 4-11, 10-8
  packed word integers data type, 4-11, 11-5
16-bit
  address size, 3-11
  operand size, 3-11
286 processor, 2-1
32-bit
  address size, 3-11
  operand size, 3-11
64-bit
  packed byte integers data type, 4-10, 9-4
  packed doubleword integers data type, 4-10
  packed doubleword integers data types, 9-4
  packed word integers data type, 4-10, 9-4
64-bit mode
  sub-mode of IA-32e, 3-2
  address calculation, 3-12
  address size, 3-25
  address space, 3-6
  BOUND instruction, 7-26
  branch behavior, 6-11
  byte register limitation, 3-17
  CALL instruction, 6-12, 7-25
  canonical address, 3-13
  CMPS instruction, 7-28
  CMPXCHG16B instruction, 7-7
  data types, 7-2
  DEC instruction, 7-12
  decimal arithmetic instructions, 7-15
  default operand and address sizes, 3-2
  exceptions, 6-19
  far pointer, 4-8
  feature list, 2-21
  GDTR register, 3-6
  IDTR register, 3-6
  INC instruction, 7-12
  instruction pointer, 3-12, 3-24
  instructions introduced, 5-30
  interrupts, 6-19
  introduction, 2-21, 3-2, 7-2
  IRET instruction, 7-26
  I/O instructions, 7-29
  JCC instruction, 6-12, 7-25
  JCXZ instruction, 6-12, 7-25
  JMP instruction, 6-12, 7-25
  LAHF instruction, 7-31
  LDTR register, 3-6
  legacy modes, 2-22
  LODS instruction, 7-28
  LOOP instruction, 6-12, 7-25
  memory models, 3-11
  memory operands, 3-28
  MMX technology, 9-2
  MOVS instruction, 7-28
  MOVSXD instruction, 7-11
  near pointer, 4-8
  operand addressing, 3-32
  operand size, 3-25
  operands, 3-28
  POPF instruction, 7-31
  promoted instructions, 3-2
  PUSHA, PUSHAD, POPA,
    POPAD, 7-10
  PUSHF instruction, 7-31
  PUSHFD instruction, 7-31
  real address mode, 3-11
  register operands, 3-28
  REP prefix, 7-28
  RET instruction, 6-12, 7-25
  REX prefix, 3-2, 3-16, 3-25
  RFLAGS register, 7-31
  RIP register, 3-12
  RIP-relative addressing, 3-24, 3-32
  SAHF instruction, 7-31
  SCAS instruction, 7-28
  segment registers, 3-20
  segmentation, 3-11, 3-30
  SSE extensions, 10-4
  SSE2 extensions, 11-4
  SSE3 extensions, 12-1
  SSSE3 extensions, 12-1
  stack behavior, 6-5
  STOS instruction, 7-28
  TR register, 3-6
  x87 FPU, 8-2
  See also: IA-32e mode, compatibility mode
8086 processor, 2-1
8088 processor, 2-1
A
AAA instruction, 7-14
AAD instruction, 7-14
AAM instruction, 7-14
AAS instruction, 7-14
AC (alignment check) flag, EFLAGS register, 3-23
Access rights, segment descriptor, 6-9, 6-14
ADC instruction, 7-12
ADD instruction, 7-12
Vol.