From b9365a9ab8f54ba6516ddb6bc80f4e5d8c419995 Mon Sep 17 00:00:00 2001
From: Klaus Treichel
Date: Sun, 13 Apr 2008 17:55:36 +0000
Subject: [PATCH] Add support for more opcodes on x86-64.

---
 ChangeLog                |   14 +
 jit/jit-gen-x86-64.h     | 1009 +++++++++++++++++++++++++++++++++++---
 jit/jit-rules-x86-64.c   |  516 ++++++++++++++++++-
 jit/jit-rules-x86-64.ins |  548 ++++++++++++++++++---
 4 files changed, 1960 insertions(+), 127 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d2e2f16..cec98a4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -10,6 +10,20 @@
 	* include/jit/jit-walk.h: use _JIT_ARCH_GET_RETURN_ADDRESS and
 	_JIT_ARCH_GET_CURRENT_RETURN if available.
 
+	* jit/jit-gen-x86-64.h: Add additional macros for saving and
+	restoring the fpu control word and the mxcsr register. Add
+	additional SSE conversion macros. Add SSE compare macros.
+	Add macros for the SSE bit operations on packed values.
+	Add macros for SSE sqrt and rounding. Add macros for fpu rounding.
+
+	* jit/jit-rules-x86-64.c: Add the dreg register class and functions
+	to handle rounding and SSE bit opcodes on packed values.
+
+	* jit/jit-rules-x86-64.ins: Add INT_TO_NFLOAT, LONG_TO_NFLOAT,
+	FLOAT32_TO_NFLOAT, FLOAT64_TO_NFLOAT.
+	Rewrite NFLOAT_TO_INT and NFLOAT_TO_LONG to use the new functions
+	in jit-rules-x86-64.c. Add handling of ABS, NEG and float compares.
+
 2008-03-31  Klaus Treichel
 
 	* jit/jit-rules-x86.ins: Fix the sign opcode for integers and the

diff --git a/jit/jit-gen-x86-64.h b/jit/jit-gen-x86-64.h
index 40744d8..2a15ee7 100644
--- a/jit/jit-gen-x86-64.h
+++ b/jit/jit-gen-x86-64.h
@@ -110,6 +110,28 @@ typedef enum
 	XMM1_DIV = 0x5E
 } X86_64_XMM1_OP;
 
+/*
+ * Logical opcodes used with packed single and double precision values.
+ */
+typedef enum
+{
+	XMM_ANDP = 0x54,
+	XMM_ORP = 0x56,
+	XMM_XORP = 0x57
+} X86_64_XMM_PLOP;
+
+/*
+ * Rounding modes for xmm rounding instructions, the mxcsr register and
+ * the fpu control word.
+ */
+typedef enum
+{
+	X86_ROUND_NEAREST = 0x00,	/* Round to the nearest integer */
+	X86_ROUND_DOWN = 0x01,		/* Round towards negative infinity */
+	X86_ROUND_UP = 0x02,		/* Round towards positive infinity */
+	X86_ROUND_ZERO = 0x03		/* Round towards zero (truncate) */
+} X86_64_ROUNDMODE;
+
 /*
  * Helper union for emitting 64 bit immediate values.
*/ @@ -3559,6 +3581,59 @@ typedef union x86_64_memindex_emit((inst), (r), (basereg), (disp), (indexreg), (shift)); \ } while(0) +/* + * xmm instructions with a prefix and three opcodes + */ +#define x86_64_p1_xmm3_reg_reg_size(inst, p1, opc1, opc2, opc3, r, reg, size) \ + do { \ + *(inst)++ = (unsigned char)(p1); \ + x86_64_rex_emit(inst, (size), (r), 0, (reg)); \ + *(inst)++ = (unsigned char)(opc1); \ + *(inst)++ = (unsigned char)(opc2); \ + *(inst)++ = (unsigned char)(opc3); \ + x86_64_reg_emit(inst, (r), (reg)); \ + } while(0) + +#define x86_64_p1_xmm3_reg_regp_size(inst, p1, opc1, opc2, opc3, r, regp, size) \ + do { \ + *(inst)++ = (unsigned char)(p1); \ + x86_64_rex_emit(inst, (size), (r), 0, (regp)); \ + *(inst)++ = (unsigned char)(opc1); \ + *(inst)++ = (unsigned char)(opc2); \ + *(inst)++ = (unsigned char)(opc3); \ + x86_64_regp_emit(inst, (r), (regp)); \ + } while(0) + +#define x86_64_p1_xmm3_reg_mem_size(inst, p1, opc1, opc2, opc3, r, mem, size) \ + do { \ + *(inst)++ = (unsigned char)(p1); \ + x86_64_rex_emit(inst, (size), (r), 0, 0); \ + *(inst)++ = (unsigned char)(opc1); \ + *(inst)++ = (unsigned char)(opc2); \ + *(inst)++ = (unsigned char)(opc3); \ + x86_64_mem_emit(inst, (r), (mem)); \ + } while(0) + +#define x86_64_p1_xmm3_reg_membase_size(inst, p1, opc1, opc2, opc3, r, basereg, disp, size) \ + do { \ + *(inst)++ = (unsigned char)(p1); \ + x86_64_rex_emit(inst, (size), (r), 0, (basereg)); \ + *(inst)++ = (unsigned char)(opc1); \ + *(inst)++ = (unsigned char)(opc2); \ + *(inst)++ = (unsigned char)(opc3); \ + x86_64_membase_emit(inst, (r), (basereg), (disp)); \ + } while(0) + +#define x86_64_p1_xmm3_reg_memindex_size(inst, p1, opc1, opc2, opc3, r, basereg, disp, indexreg, shift, size) \ + do { \ + *(inst)++ = (unsigned char)(p1); \ + x86_64_rex_emit(inst, (size), (r), (indexreg), (basereg)); \ + *(inst)++ = (unsigned char)(opc1); \ + *(inst)++ = (unsigned char)(opc2); \ + *(inst)++ = (unsigned char)(opc3); \ + x86_64_memindex_emit((inst), (r), (basereg), (disp), (indexreg), (shift)); \ + } while(0) + /* * xmm1: Macro for use of the X86_64_XMM1 enum */ @@ -3587,6 +3662,56 @@ typedef union x86_64_p1_xmm2_reg_memindex_size((inst), ((is_double) ? 
			0xf2 : 0xf3), 0x0f, (opc), (dreg), (basereg), (disp), (indexreg), (shift), 0); \
 	} while(0)
 
+/*
+ * Load and store MXCSR register state
+ */
+
+/*
+ * ldmxcsr: Load MXCSR register
+ */
+#define x86_64_ldmxcsr_regp(inst, sregp) \
+	do { \
+		x86_64_xmm2_reg_regp((inst), 0x0f, 0xae, 2, (sregp)); \
+	} while(0)
+
+#define x86_64_ldmxcsr_mem(inst, mem) \
+	do { \
+		x86_64_xmm2_reg_mem((inst), 0x0f, 0xae, 2, (mem)); \
+	} while(0)
+
+#define x86_64_ldmxcsr_membase(inst, basereg, disp) \
+	do { \
+		x86_64_xmm2_reg_membase((inst), 0x0f, 0xae, 2, (basereg), (disp)); \
+	} while(0)
+
+#define x86_64_ldmxcsr_memindex(inst, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_xmm2_reg_memindex((inst), 0x0f, 0xae, 2, (basereg), (disp), (indexreg), (shift)); \
+	} while(0)
+
+/*
+ * stmxcsr: Store MXCSR register
+ */
+#define x86_64_stmxcsr_regp(inst, sregp) \
+	do { \
+		x86_64_xmm2_reg_regp((inst), 0x0f, 0xae, 3, (sregp)); \
+	} while(0)
+
+#define x86_64_stmxcsr_mem(inst, mem) \
+	do { \
+		x86_64_xmm2_reg_mem((inst), 0x0f, 0xae, 3, (mem)); \
+	} while(0)
+
+#define x86_64_stmxcsr_membase(inst, basereg, disp) \
+	do { \
+		x86_64_xmm2_reg_membase((inst), 0x0f, 0xae, 3, (basereg), (disp)); \
+	} while(0)
+
+#define x86_64_stmxcsr_memindex(inst, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_xmm2_reg_memindex((inst), 0x0f, 0xae, 3, (basereg), (disp), (indexreg), (shift)); \
+	} while(0)
+
 /*
  * Move instructions
  */
@@ -3951,6 +4076,66 @@ typedef union
 		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x2a, (dreg), (basereg), (disp), (indexreg), (shift), (size)); \
 	} while(0)
 
+/*
+ * cvtss2si: Convert float32 to a signed integer using the rounding mode
+ * in the mxcsr register
+ * The size is the size of the integer value (4 or 8)
+ */
+#define x86_64_cvtss2si_reg_reg_size(inst, dreg, sxreg, size) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x2d, (dreg), (sxreg), (size)); \
+	} while(0)
+
+#define x86_64_cvtss2si_reg_regp_size(inst, dreg, sregp, size) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x2d, (dreg), (sregp), (size)); \
+	} while(0)
+
+#define x86_64_cvtss2si_reg_mem_size(inst, dreg, mem, size) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x2d, (dreg), (mem), (size)); \
+	} while(0)
+
+#define x86_64_cvtss2si_reg_membase_size(inst, dreg, basereg, disp, size) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x2d, (dreg), (basereg), (disp), (size)); \
+	} while(0)
+
+#define x86_64_cvtss2si_reg_memindex_size(inst, dreg, basereg, disp, indexreg, shift, size) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x2d, (dreg), (basereg), (disp), (indexreg), (shift), (size)); \
+	} while(0)
+
+/*
+ * cvtsd2si: Convert float64 to a signed integer using the rounding mode
+ * in the mxcsr register
+ * The size is the size of the integer value (4 or 8)
+ */
+#define x86_64_cvtsd2si_reg_reg_size(inst, dreg, sxreg, size) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x2d, (dreg), (sxreg), (size)); \
+	} while(0)
+
+#define x86_64_cvtsd2si_reg_regp_size(inst, dreg, sregp, size) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x2d, (dreg), (sregp), (size)); \
+	} while(0)
+
+#define x86_64_cvtsd2si_reg_mem_size(inst, dreg, mem, size) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x2d, (dreg), (mem), (size)); \
+	} while(0)
+
+#define x86_64_cvtsd2si_reg_membase_size(inst, dreg, basereg, disp, size) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x2d, (dreg),
(basereg), (disp), (size)); \ + } while(0) + +#define x86_64_cvtsd2si_reg_memindex_size(inst, dreg, basereg, disp, indexreg, shift, size) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x2d, (dreg), (basereg), (disp), (indexreg), (shift), (size)); \ + } while(0) + /* * cvtss2sd: Convert float32 to float64 */ @@ -4007,6 +4192,122 @@ typedef union x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5a, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ } while(0) +/* + * Compare opcodes + */ + +/* + * comiss: Compare ordered scalar single precision values + */ +#define x86_64_comiss_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_xmm2_reg_reg((inst), 0x0f, 0x2f, (dreg), (sreg)); \ + } while(0) + +#define x86_64_comiss_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_xmm2_reg_regp((inst), 0x0f, 0x2f, (dreg), (sregp)); \ + } while(0) + +#define x86_64_comiss_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_xmm2_reg_mem((inst), 0x0f, 0x2f, (dreg), (mem)); \ + } while(0) + +#define x86_64_comiss_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_xmm2_reg_membase((inst), 0x0f, 0x2f, (dreg), (basereg), (disp)); \ + } while(0) + +#define x86_64_comiss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_xmm2_reg_memindex((inst), 0x0f, 0x2f, (dreg), (basereg), (disp), (indexreg), (shift)); \ + } while(0) + +/* + * comisd: Compare ordered scalar double precision values + */ +#define x86_64_comisd_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x2f, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_comisd_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x2f, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_comisd_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x2f, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_comisd_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x2f, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_comisd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x2f, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + +/* + * ucomiss: Compare unordered scalar single precision values + */ +#define x86_64_ucomiss_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_xmm2_reg_reg((inst), 0x0f, 0x2e, (dreg), (sreg)); \ + } while(0) + +#define x86_64_ucomiss_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_xmm2_reg_regp((inst), 0x0f, 0x2e, (dreg), (sregp)); \ + } while(0) + +#define x86_64_ucomiss_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_xmm2_reg_mem((inst), 0x0f, 0x2e, (dreg), (mem)); \ + } while(0) + +#define x86_64_ucomiss_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_xmm2_reg_membase((inst), 0x0f, 0x2e, (dreg), (basereg), (disp)); \ + } while(0) + +#define x86_64_ucomiss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_xmm2_reg_memindex((inst), 0x0f, 0x2e, (dreg), (basereg), (disp), (indexreg), (shift)); \ + } while(0) + +/* + * ucomisd: Compare unordered scalar double precision values + */ +#define x86_64_ucomisd_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x2e, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_ucomisd_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x2e, (dreg), (sregp), 0); \ + } while(0) + +#define 
x86_64_ucomisd_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x2e, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_ucomisd_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x2e, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_ucomisd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x2e, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + /* * Arithmetic opcodes */ @@ -4124,130 +4425,601 @@ typedef union } while(0) /* - * addsd: Add scalar double precision float values + * Macros for the logical operations with packed single precision values. */ -#define x86_64_addsd_reg_reg(inst, dreg, sreg) \ +#define x86_64_plops_reg_reg(inst, op, dreg, sreg) \ do { \ - x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x58, (dreg), (sreg), 0); \ + x86_64_xmm2_reg_reg((inst), 0x0f, (op), (dreg), (sreg)); \ } while(0) -#define x86_64_addsd_reg_regp(inst, dreg, sregp) \ +#define x86_64_plops_reg_regp(inst, op, dreg, sregp) \ do { \ - x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x58, (dreg), (sregp), 0); \ + x86_64_xmm2_reg_regp((inst), 0x0f, (op), (dreg), (sregp)); \ } while(0) -#define x86_64_addsd_reg_mem(inst, dreg, mem) \ +#define x86_64_plops_reg_mem(inst, op, dreg, mem) \ do { \ - x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x58, (dreg), (mem), 0); \ + x86_64_xmm2_reg_mem((inst), 0x0f, (op), (dreg), (mem)); \ } while(0) -#define x86_64_addsd_reg_membase(inst, dreg, basereg, disp) \ +#define x86_64_plops_reg_membase(inst, op, dreg, basereg, disp) \ do { \ - x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x58, (dreg), (basereg), (disp), 0); \ + x86_64_xmm2_reg_membase((inst), 0x0f, (op), (dreg), (basereg), (disp)); \ } while(0) -#define x86_64_addsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ +#define x86_64_plops_reg_memindex(inst, op, dreg, basereg, disp, indexreg, shift) \ do { \ - x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x58, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + x86_64_xmm2_reg_memindex((inst), 0x0f, (op), (dreg), (basereg), (disp), (indexreg), (shift)); \ } while(0) /* - * subsd: Substract scalar double precision float values + * andps: And */ -#define x86_64_subsd_reg_reg(inst, dreg, sreg) \ +#define x86_64_andps_reg_reg(inst, dreg, sreg) \ do { \ - x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (sreg), 0); \ + x86_64_xmm2_reg_reg((inst), 0x0f, 0x54, (dreg), (sreg)); \ } while(0) -#define x86_64_subsd_reg_regp(inst, dreg, sregp) \ +#define x86_64_andps_reg_regp(inst, dreg, sregp) \ do { \ - x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (sregp), 0); \ + x86_64_xmm2_reg_regp((inst), 0x0f, 0x54, (dreg), (sregp)); \ } while(0) -#define x86_64_subsd_reg_mem(inst, dreg, mem) \ +#define x86_64_andps_reg_mem(inst, dreg, mem) \ do { \ - x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (mem), 0); \ + x86_64_xmm2_reg_mem((inst), 0x0f, 0x54, (dreg), (mem)); \ } while(0) -#define x86_64_subsd_reg_membase(inst, dreg, basereg, disp) \ +#define x86_64_andps_reg_membase(inst, dreg, basereg, disp) \ do { \ - x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (basereg), (disp), 0); \ + x86_64_xmm2_reg_membase((inst), 0x0f, 0x54, (dreg), (basereg), (disp)); \ } while(0) -#define x86_64_subsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ +#define x86_64_andps_reg_memindex(inst, dreg, 
basereg, disp, indexreg, shift) \ do { \ - x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + x86_64_xmm2_reg_memindex((inst), 0x0f, 0x54, (dreg), (basereg), (disp), (indexreg), (shift)); \ } while(0) /* - * mulsd: Multiply scalar double precision float values + * orps: Or */ -#define x86_64_mulsd_reg_reg(inst, dreg, sreg) \ +#define x86_64_orps_reg_reg(inst, dreg, sreg) \ do { \ - x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x59, (dreg), (sreg), 0); \ + x86_64_xmm2_reg_reg((inst), 0x0f, 0x56, (dreg), (sreg)); \ } while(0) -#define x86_64_mulsd_reg_regp(inst, dreg, sregp) \ +#define x86_64_orps_reg_regp(inst, dreg, sregp) \ do { \ - x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x59, (dreg), (sregp), 0); \ + x86_64_xmm2_reg_regp((inst), 0x0f, 0x56, (dreg), (sregp)); \ } while(0) -#define x86_64_mulsd_reg_mem(inst, dreg, mem) \ +#define x86_64_orps_reg_mem(inst, dreg, mem) \ do { \ - x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x59, (dreg), (mem), 0); \ + x86_64_xmm2_reg_mem((inst), 0x0f, 0x56, (dreg), (mem)); \ } while(0) -#define x86_64_mulsd_reg_membase(inst, dreg, basereg, disp) \ +#define x86_64_orps_reg_membase(inst, dreg, basereg, disp) \ do { \ - x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x59, (dreg), (basereg), (disp), 0); \ + x86_64_xmm2_reg_membase((inst), 0x0f, 0x56, (dreg), (basereg), (disp)); \ } while(0) -#define x86_64_mulsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ +#define x86_64_orps_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ do { \ - x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x59, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + x86_64_xmm2_reg_memindex((inst), 0x0f, 0x56, (dreg), (basereg), (disp), (indexreg), (shift)); \ } while(0) /* - * divsd: Divide scalar double precision float values + * xorps: Xor */ -#define x86_64_divsd_reg_reg(inst, dreg, sreg) \ +#define x86_64_xorps_reg_reg(inst, dreg, sreg) \ do { \ - x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (sreg), 0); \ + x86_64_xmm2_reg_reg((inst), 0x0f, 0x57, (dreg), (sreg)); \ } while(0) -#define x86_64_divsd_reg_regp(inst, dreg, sregp) \ +#define x86_64_xorps_reg_regp(inst, dreg, sregp) \ do { \ - x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (sregp), 0); \ + x86_64_xmm2_reg_regp((inst), 0x0f, 0x57, (dreg), (sregp)); \ } while(0) -#define x86_64_divsd_reg_mem(inst, dreg, mem) \ +#define x86_64_xorps_reg_mem(inst, dreg, mem) \ do { \ - x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (mem), 0); \ + x86_64_xmm2_reg_mem((inst), 0x0f, 0x57, (dreg), (mem)); \ } while(0) -#define x86_64_divsd_reg_membase(inst, dreg, basereg, disp) \ +#define x86_64_xorps_reg_membase(inst, dreg, basereg, disp) \ do { \ - x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (basereg), (disp), 0); \ + x86_64_xmm2_reg_membase((inst), 0x0f, 0x57, (dreg), (basereg), (disp)); \ } while(0) -#define x86_64_divsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ +#define x86_64_xorps_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ do { \ - x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + x86_64_xmm2_reg_memindex((inst), 0x0f, 0x57, (dreg), (basereg), (disp), (indexreg), (shift)); \ } while(0) /* - * fpu instructions + * maxss: Maximum value */ +#define x86_64_maxss_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5f, (dreg), 
(sreg), 0); \ + } while(0) + +#define x86_64_maxss_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5f, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_maxss_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5f, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_maxss_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5f, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_maxss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5f, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) /* - * fld + * minss: Minimum value */ +#define x86_64_minss_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5d, (dreg), (sreg), 0); \ + } while(0) -#define x86_64_fld_regp_size(inst, sregp, size) \ +#define x86_64_minss_reg_regp(inst, dreg, sregp) \ do { \ - x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \ - switch(size) \ - { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5d, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_minss_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5d, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_minss_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5d, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_minss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5d, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + +/* + * sqrtss: Square root + */ +#define x86_64_sqrtss_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x51, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_sqrtss_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x51, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_sqrtss_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x51, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_sqrtss_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x51, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_sqrtss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x51, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + + +/* + * Macros for the logical operations with packed double precision values. 
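+ *
+ * Together with the X86_64_XMM_PLOP enum these are the building
+ * blocks for the float ABS and NEG rules: andpd with a mask that
+ * clears the sign bit yields the absolute value, and xorpd with a
+ * mask that contains only the sign bit flips the sign. A minimal
+ * sketch (hypothetical names; the 16 byte mask would normally be
+ * placed in the constant pool):
+ *
+ *	static const jit_ulong abs_mask[2] = {0x7fffffffffffffffULL, 0};
+ *
+ *	x86_64_plopd_reg_mem(inst, XMM_ANDP, xmm_reg, (jit_nint)abs_mask);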
+ */
+#define x86_64_plopd_reg_reg(inst, op, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, (op), (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_plopd_reg_regp(inst, op, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, (op), (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_plopd_reg_mem(inst, op, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, (op), (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_plopd_reg_membase(inst, op, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, (op), (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_plopd_reg_memindex(inst, op, dreg, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, (op), (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * addsd: Add scalar double precision float values
+ */
+#define x86_64_addsd_reg_reg(inst, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x58, (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_addsd_reg_regp(inst, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x58, (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_addsd_reg_mem(inst, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x58, (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_addsd_reg_membase(inst, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x58, (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_addsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x58, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * subsd: Subtract scalar double precision float values
+ */
+#define x86_64_subsd_reg_reg(inst, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_subsd_reg_regp(inst, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_subsd_reg_mem(inst, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_subsd_reg_membase(inst, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_subsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * mulsd: Multiply scalar double precision float values
+ */
+#define x86_64_mulsd_reg_reg(inst, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x59, (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_mulsd_reg_regp(inst, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x59, (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_mulsd_reg_mem(inst, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x59, (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_mulsd_reg_membase(inst, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x59, (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_mulsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+	do { \
x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x59, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + +/* + * divsd: Divide scalar double precision float values + */ +#define x86_64_divsd_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_divsd_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_divsd_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_divsd_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_divsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + +/* + * andpd: And + */ +#define x86_64_andpd_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x54, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_andpd_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x54, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_andpd_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x54, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_andpd_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x54, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_andpd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x54, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + +/* + * orpd: Or + */ +#define x86_64_orpd_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x56, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_orpd_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x56, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_orpd_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x56, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_orpd_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x56, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_orpd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do { \ + x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x56, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + } while(0) + +/* + * xorpd: Xor + */ +#define x86_64_xorpd_reg_reg(inst, dreg, sreg) \ + do { \ + x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x57, (dreg), (sreg), 0); \ + } while(0) + +#define x86_64_xorpd_reg_regp(inst, dreg, sregp) \ + do { \ + x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x57, (dreg), (sregp), 0); \ + } while(0) + +#define x86_64_xorpd_reg_mem(inst, dreg, mem) \ + do { \ + x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x57, (dreg), (mem), 0); \ + } while(0) + +#define x86_64_xorpd_reg_membase(inst, dreg, basereg, disp) \ + do { \ + x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x57, (dreg), (basereg), (disp), 0); \ + } while(0) + +#define x86_64_xorpd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \ + do 
{ \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x57, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * maxsd: Maximum value
+ */
+#define x86_64_maxsd_reg_reg(inst, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5f, (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_maxsd_reg_regp(inst, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5f, (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_maxsd_reg_mem(inst, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5f, (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_maxsd_reg_membase(inst, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5f, (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_maxsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5f, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * minsd: Minimum value
+ */
+#define x86_64_minsd_reg_reg(inst, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5d, (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_minsd_reg_regp(inst, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5d, (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_minsd_reg_mem(inst, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5d, (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_minsd_reg_membase(inst, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5d, (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_minsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5d, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * sqrtsd: Square root
+ */
+#define x86_64_sqrtsd_reg_reg(inst, dreg, sreg) \
+	do { \
+		x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x51, (dreg), (sreg), 0); \
+	} while(0)
+
+#define x86_64_sqrtsd_reg_regp(inst, dreg, sregp) \
+	do { \
+		x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x51, (dreg), (sregp), 0); \
+	} while(0)
+
+#define x86_64_sqrtsd_reg_mem(inst, dreg, mem) \
+	do { \
+		x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x51, (dreg), (mem), 0); \
+	} while(0)
+
+#define x86_64_sqrtsd_reg_membase(inst, dreg, basereg, disp) \
+	do { \
+		x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x51, (dreg), (basereg), (disp), 0); \
+	} while(0)
+
+#define x86_64_sqrtsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+	do { \
+		x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x51, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+	} while(0)
+
+/*
+ * Rounding: Available in SSE 4.1 only
+ */
+
+/*
+ * roundss: Round scalar single precision value
+ */
+#define x86_64_roundss_reg_reg(inst, dreg, sreg, mode) \
+	do { \
+		x86_64_p1_xmm3_reg_reg_size((inst), 0x66, 0x0f, 0x3a, 0x0a, (dreg), (sreg), 0); \
+		x86_imm_emit8((inst), (mode)); \
+	} while(0)
+
+#define x86_64_roundss_reg_regp(inst, dreg, sregp, mode) \
+	do { \
+		x86_64_p1_xmm3_reg_regp_size((inst), 0x66, 0x0f, 0x3a, 0x0a, (dreg), (sregp), 0); \
+		x86_imm_emit8((inst), (mode)); \
+	} while(0)
+
+#define x86_64_roundss_reg_mem(inst, dreg, mem, mode) \
+	do { \
+		x86_64_p1_xmm3_reg_mem_size((inst), 0x66, 0x0f, 0x3a, 0x0a, (dreg), (mem), 0); \
+		x86_imm_emit8((inst), (mode)); \
+	} while(0)
+
+#define
x86_64_roundss_reg_membase(inst, dreg, basereg, disp, mode) \ + do { \ + x86_64_p1_xmm3_reg_membase_size((inst), 0x66, 0x0f, 0x3a, 0x0a, (dreg), (basereg), (disp), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +#define x86_64_roundss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift, mode) \ + do { \ + x86_64_p1_xmm3_reg_memindex_size((inst), 0x66, 0x0f, 0x3a, 0x0a, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +/* + * roundsd: Round scalar double precision value + */ +#define x86_64_roundsd_reg_reg(inst, dreg, sreg, mode) \ + do { \ + x86_64_p1_xmm3_reg_reg_size((inst), 0x66, 0x0f, 0x3a, 0x0b, (dreg), (sreg), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +#define x86_64_roundsd_reg_regp(inst, dreg, sregp, mode) \ + do { \ + x86_64_p1_xmm3_reg_regp_size((inst), 0x66, 0x0f, 0x3a, 0x0b, (dreg), (sregp), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +#define x86_64_roundsd_reg_mem(inst, dreg, mem, mode) \ + do { \ + x86_64_p1_xmm3_reg_mem_size((inst), 0x66, 0x0f, 0x3a, 0x0b, (dreg), (mem), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +#define x86_64_roundsd_reg_membase(inst, dreg, basereg, disp, mode) \ + do { \ + x86_64_p1_xmm3_reg_membase_size((inst), 0x66, 0x0f, 0x3a, 0x0b, (dreg), (basereg), (disp), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +#define x86_64_roundsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift, mode) \ + do { \ + x86_64_p1_xmm3_reg_memindex_size((inst), 0x66, 0x0f, 0x3a, 0x0b, (dreg), (basereg), (disp), (indexreg), (shift), 0); \ + x86_imm_emit8((inst), (mode)); \ + } while(0) + +/* + * Clear xmm register + */ +#define x86_64_clear_xreg(inst, reg) \ + do { \ + x86_64_xorps_reg_reg((inst), (reg), (reg)); \ + } while(0) + +/* + * fpu instructions + */ + +/* + * fld + */ + +#define x86_64_fld_regp_size(inst, sregp, size) \ + do { \ + x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \ + switch(size) \ + { \ case 4: \ { \ *(inst)++ = (unsigned char)0xd9; \ @@ -4349,7 +5121,7 @@ typedef union /* * fild: Load an integer and convert it to long double */ -#define x86_fild_mem_size(inst, mem, size) \ +#define x86_64_fild_mem_size(inst, mem, size) \ do { \ switch(size) \ { \ @@ -4374,7 +5146,7 @@ typedef union } \ } while (0) -#define x86_fild_membase_size(inst, mem, size) \ +#define x86_64_fild_membase_size(inst, basereg, disp, size) \ do { \ x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \ switch(size) \ @@ -4486,7 +5258,6 @@ typedef union /* * fstp: store top fpu register to memory and pop it from the fpu stack */ - #define x86_64_fstp_regp_size(inst, sregp, size) \ do { \ x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \ @@ -4591,7 +5362,7 @@ typedef union } while(0) /* - * Convert long double to integer + * fistp: Convert long double to integer */ #define x86_64_fistp_mem_size(inst, mem, size) \ do { \ @@ -4618,8 +5389,35 @@ typedef union } \ } while(0) +#define x86_64_fistp_regp_size(inst, dregp, size) \ + do { \ + x86_64_rex_emit((inst), 0, 0, 0, (dregp)); \ + switch((size)) \ + { \ + case 2: \ + { \ + *(inst)++ = (unsigned char)0xdf; \ + x86_64_regp_emit((inst), 3, (dregp)); \ + } \ + break; \ + case 4: \ + { \ + *(inst)++ = (unsigned char)0xdb; \ + x86_64_regp_emit((inst), 3, (dregp)); \ + } \ + break; \ + case 8: \ + { \ + *(inst)++ = (unsigned char)0xdf; \ + x86_64_regp_emit((inst), 7, (dregp)); \ + } \ + break; \ + } \ + } while(0) + #define x86_64_fistp_membase_size(inst, basereg, disp, size) \ do { \ + x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \ 
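+		/* The REX prefix has to be emitted before the opcode so that r8-r15 can be used as base registers */ \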
 		switch((size)) \
 		{ \
 			case 2: \
@@ -4643,6 +5441,107 @@
 		} \
 	} while(0)
 
+/*
+ * frndint: Round st(0) to integer according to the rounding mode set in the fpu control word.
+ */
+#define x86_64_frndint(inst) \
+	do { \
+		*(inst)++ = (unsigned char)0xd9; \
+		*(inst)++ = (unsigned char)0xfc; \
+	} while(0)
+
+/*
+ * fisttp: Convert long double to integer using truncation as rounding mode.
+ * Available in SSE 3 only.
+ */
+#define x86_64_fisttp_regp_size(inst, dregp, size) \
+	do { \
+		x86_64_rex_emit((inst), 0, 0, 0, (dregp)); \
+		switch((size)) \
+		{ \
+			case 2: \
+			{ \
+				*(inst)++ = (unsigned char)0xdf; \
+				x86_64_regp_emit((inst), 1, (dregp)); \
+			} \
+			break; \
+			case 4: \
+			{ \
+				*(inst)++ = (unsigned char)0xdb; \
+				x86_64_regp_emit((inst), 1, (dregp)); \
+			} \
+			break; \
+			case 8: \
+			{ \
+				*(inst)++ = (unsigned char)0xdd; \
+				x86_64_regp_emit((inst), 1, (dregp)); \
+			} \
+			break; \
+		} \
+	} while(0)
+
+#define x86_64_fisttp_mem_size(inst, mem, size) \
+	do { \
+		switch((size)) \
+		{ \
+			case 2: \
+			{ \
+				*(inst)++ = (unsigned char)0xdf; \
+				x86_64_mem_emit((inst), 1, (mem)); \
+			} \
+			break; \
+			case 4: \
+			{ \
+				*(inst)++ = (unsigned char)0xdb; \
+				x86_64_mem_emit((inst), 1, (mem)); \
+			} \
+			break; \
+			case 8: \
+			{ \
+				*(inst)++ = (unsigned char)0xdd; \
+				x86_64_mem_emit((inst), 1, (mem)); \
+			} \
+			break; \
+		} \
+	} while(0)
+
+#define x86_64_fisttp_membase_size(inst, basereg, disp, size) \
+	do { \
+		x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+		switch((size)) \
+		{ \
+			case 2: \
+			{ \
+				*(inst)++ = (unsigned char)0xdf; \
+				x86_64_membase_emit((inst), 1, (basereg), (disp)); \
+			} \
+			break; \
+			case 4: \
+			{ \
+				*(inst)++ = (unsigned char)0xdb; \
+				x86_64_membase_emit((inst), 1, (basereg), (disp)); \
+			} \
+			break; \
+			case 8: \
+			{ \
+				*(inst)++ = (unsigned char)0xdd; \
+				x86_64_membase_emit((inst), 1, (basereg), (disp)); \
+			} \
+			break; \
+		} \
+	} while(0)
+
+#define x86_64_fabs(inst) \
+	do { \
+		*(inst)++ = (unsigned char)0xd9; \
+		*(inst)++ = (unsigned char)0xe1; \
+	} while(0)
+
+#define x86_64_fchs(inst) \
+	do { \
+		*(inst)++ = (unsigned char)0xd9; \
+		*(inst)++ = (unsigned char)0xe0; \
+	} while(0)
+
 /*
  * Store fpu control word without checking for pending unmasked fpu exceptions
  */
@@ -4650,7 +5549,7 @@
 	do { \
 		*(inst)++ = (unsigned char)0xd9; \
 		x86_64_mem_emit((inst), 7, (mem)); \
-	} while (0)
+	} while(0)
 
 #define x86_64_fnstcw_membase(inst, basereg, disp) \
 	do { \
diff --git a/jit/jit-rules-x86-64.c b/jit/jit-rules-x86-64.c
index bcb15bc..8bb5e48 100644
--- a/jit/jit-rules-x86-64.c
+++ b/jit/jit-rules-x86-64.c
@@ -103,6 +103,22 @@
  */
 #define HAVE_RED_ZONE 1
 
+/*
+ * Some declarations that should be replaced by querying the cpuinfo
+ * if generating code for the current cpu.
+ */
+/*
+#define HAVE_X86_SSE_4_1 0
+#define HAVE_X86_SSE_4 0
+#define HAVE_X86_SSE_3 0
+#define HAVE_X86_FISTTP 0
+*/
+
+#define TODO() \
+do { \
+	fprintf(stderr, "TODO at %s, %d\n", __FILE__, (int)__LINE__); \
+} while(0)
+
 /*
  * Setup or teardown the x86 code output process.
 */
@@ -165,6 +181,9 @@
 static int _jit_sse_return_regs[] = {X86_64_REG_XMM0, X86_64_REG_XMM1};
 
 static _jit_regclass_t *x86_64_reg;		/* X86_64 general purpose registers */
 static _jit_regclass_t *x86_64_creg;	/* X86_64 call clobbered general */
 										/* purpose registers */
+static _jit_regclass_t *x86_64_dreg;	/* general purpose registers that */
+										/* can be used as divisor */
+										/* (all but %rax and %rdx) */
 static _jit_regclass_t *x86_64_rreg;	/* general purpose registers not used*/
 										/* for returning values */
 static _jit_regclass_t *x86_64_sreg;	/* general purpose registers that can*/
@@ -196,6 +215,16 @@ _jit_init_backend(void)
 		X86_64_REG_R9, X86_64_REG_R10,
 		X86_64_REG_R11);
 
+	/* register class for divisors */
+	x86_64_dreg = _jit_regclass_create(
+		"dreg", JIT_REG_WORD | JIT_REG_LONG, 12,
+		X86_64_REG_RCX, X86_64_REG_RBX,
+		X86_64_REG_RSI, X86_64_REG_RDI,
+		X86_64_REG_R8, X86_64_REG_R9,
+		X86_64_REG_R10, X86_64_REG_R11,
+		X86_64_REG_R12, X86_64_REG_R13,
+		X86_64_REG_R14, X86_64_REG_R15);
+
 	/* register class with all registers not used for returning values */
 	x86_64_rreg = _jit_regclass_create(
 		"rreg", JIT_REG_WORD | JIT_REG_LONG, 12,
@@ -340,6 +369,452 @@ _jit_xmm1_reg_imm_size_float64(jit_gencode_t gen, unsigned char **inst_ptr,
 	return 1;
 }
 
+/*
+ * Do a logical xmm operation with packed float32 values
+ */
+static int
+_jit_plops_reg_imm(jit_gencode_t gen, unsigned char **inst_ptr,
+				   X86_64_XMM_PLOP opc, int reg, void *packed_value)
+{
+	void *ptr;
+	jit_nint offset;
+	unsigned char *inst;
+
+	inst = *inst_ptr;
+	ptr = _jit_cache_alloc(&(gen->posn), 16);
+	if(!ptr)
+	{
+		return 0;
+	}
+	jit_memcpy(ptr, packed_value, 16);
+
+	/* calculate the offset for membase addressing */
+	offset = (jit_nint)ptr - ((jit_nint)inst + (reg > 7 ? 8 : 7));
+	if((offset >= jit_min_int) && (offset <= jit_max_int))
+	{
+		/* We can use RIP relative addressing here */
+		x86_64_plops_reg_membase(inst, opc, reg, X86_64_RIP, offset);
+		*inst_ptr = inst;
+		return 1;
+	}
+	/* Check if mem addressing can be used */
+	if(((jit_nint)ptr >= jit_min_int) &&
+	   ((jit_nint)ptr <= jit_max_int))
+	{
+		/* We can use absolute addressing */
+		x86_64_plops_reg_mem(inst, opc, reg, (jit_nint)ptr);
+		*inst_ptr = inst;
+		return 1;
+	}
+	/* We have to use an extra general register */
+	TODO();
+	return 0;
+}
+
+/*
+ * Do a logical xmm operation with packed float64 values
+ */
+static int
+_jit_plopd_reg_imm(jit_gencode_t gen, unsigned char **inst_ptr,
+				   X86_64_XMM_PLOP opc, int reg, void *packed_value)
+{
+	void *ptr;
+	jit_nint offset;
+	unsigned char *inst;
+
+	inst = *inst_ptr;
+	ptr = _jit_cache_alloc(&(gen->posn), 16);
+	if(!ptr)
+	{
+		return 0;
+	}
+	jit_memcpy(ptr, packed_value, 16);
+
+	/* calculate the offset for membase addressing */
+	offset = (jit_nint)ptr - ((jit_nint)inst + (reg > 7 ? 9 : 8));
+	if((offset >= jit_min_int) && (offset <= jit_max_int))
+	{
+		/* We can use RIP relative addressing here */
+		x86_64_plopd_reg_membase(inst, opc, reg, X86_64_RIP, offset);
+		*inst_ptr = inst;
+		return 1;
+	}
+	/* Check if mem addressing can be used */
+	if(((jit_nint)ptr >= jit_min_int) &&
+	   ((jit_nint)ptr <= jit_max_int))
+	{
+		/* We can use absolute addressing */
+		x86_64_plopd_reg_mem(inst, opc, reg, (jit_nint)ptr);
+		*inst_ptr = inst;
+		return 1;
+	}
+	/* We have to use an extra general register */
+	TODO();
+	return 0;
+}
+
+/*
+ * Helpers for saving and setting the rounding mode in the fpu control
+ * word and restoring it afterwards.
+ * The rounding mode bits are bits 10 and 11 of the fpu control word.
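+ * Shifting an X86_64_ROUNDMODE value left by 10 produces the bits to
+ * be or'ed into the control word; X86_ROUND_ZERO (0x03) becomes 0x0c00,
+ * which is why ~(X86_ROUND_ZERO << 10) serves as the mask that clears
+ * both bits before a new mode is set.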
+ * sp_offset is the start offset of a temporary eight byte block.
+ */
+static unsigned char *
+_x86_64_set_fpu_roundmode(unsigned char *inst, int scratch_reg,
+						  int sp_offset, X86_64_ROUNDMODE mode)
+{
+	int fpcw_save_offset = sp_offset + 4;
+	int fpcw_new_offset = sp_offset;
+	int round_mode = ((int)mode) << 10;
+	int round_mode_mask = ~(((int)X86_ROUND_ZERO) << 10);
+
+	/* store FPU control word */
+	x86_64_fnstcw_membase(inst, X86_64_RSP, fpcw_save_offset);
+	/* load the value into the scratch register */
+	x86_64_mov_reg_membase_size(inst, scratch_reg, X86_64_RSP, fpcw_save_offset, 2);
+	/* Set the rounding mode */
+	if(mode != X86_ROUND_ZERO)
+	{
+		/* Not all bits are set in the mask so we have to clear them first */
+		x86_64_and_reg_imm_size(inst, scratch_reg, round_mode_mask, 2);
+	}
+	x86_64_or_reg_imm_size(inst, scratch_reg, round_mode, 2);
+	/* Store the new round mode */
+	x86_64_mov_membase_reg_size(inst, X86_64_RSP, fpcw_new_offset, scratch_reg, 2);
+	/* Now load the new control word */
+	x86_64_fldcw_membase(inst, X86_64_RSP, fpcw_new_offset);
+
+	return inst;
+}
+
+static unsigned char *
+_x86_64_restore_fpcw(unsigned char *inst, int sp_offset)
+{
+	int fpcw_save_offset = sp_offset + 4;
+
+	/* Now load the saved control word */
+	x86_64_fldcw_membase(inst, X86_64_RSP, fpcw_save_offset);
+
+	return inst;
+}
+
+/*
+ * Helpers for saving and setting the rounding mode in the mxcsr register
+ * and restoring it afterwards.
+ * The rounding mode bits are bits 13 and 14 of the mxcsr register.
+ * sp_offset is the start offset of a temporary eight byte block.
+ */
+static unsigned char *
+_x86_64_set_xmm_roundmode(unsigned char *inst, int scratch_reg,
+						  int sp_offset, X86_64_ROUNDMODE mode)
+{
+	int mxcsr_save_offset = sp_offset + 4;
+	int mxcsr_new_offset = sp_offset;
+	int round_mode = ((int)mode) << 13;
+	int round_mode_mask = ~(((int)X86_ROUND_ZERO) << 13);
+
+	/* save the mxcsr register */
+	x86_64_stmxcsr_membase(inst, X86_64_RSP, mxcsr_save_offset);
+	/* Load the contents of the mxcsr register into the scratch register */
+	x86_64_mov_reg_membase_size(inst, scratch_reg, X86_64_RSP, mxcsr_save_offset, 4);
+	/* Set the rounding mode */
+	if(mode != X86_ROUND_ZERO)
+	{
+		/* Not all bits are set in the mask so we have to clear them first */
+		x86_64_and_reg_imm_size(inst, scratch_reg, round_mode_mask, 4);
+	}
+	x86_64_or_reg_imm_size(inst, scratch_reg, round_mode, 4);
+	/* Store the new round mode */
+	x86_64_mov_membase_reg_size(inst, X86_64_RSP, mxcsr_new_offset, scratch_reg, 4);
+	/* and load it to the mxcsr register */
+	x86_64_ldmxcsr_membase(inst, X86_64_RSP, mxcsr_new_offset);
+
+	return inst;
+}
+
+static unsigned char *
+_x86_64_restore_mxcsr(unsigned char *inst, int sp_offset)
+{
+	int mxcsr_save_offset = sp_offset + 4;
+
+	/* restore the mxcsr register */
+	x86_64_ldmxcsr_membase(inst, X86_64_RSP, mxcsr_save_offset);
+
+	return inst;
+}
+
+/*
+ * Perform rounding of scalar single precision values.
+ * We have to use the fpu where SSE 4.1 is not supported.
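+ * A caller emits e.g. a floor operation like this (hypothetical
+ * register variables):
+ *
+ *	inst = x86_64_rounds_reg_reg(inst, dxmm, sxmm, scratch_reg, X86_ROUND_DOWN);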
+ */
+static unsigned char *
+x86_64_rounds_reg_reg(unsigned char *inst, int dreg, int sreg,
+					  int scratch_reg, X86_64_ROUNDMODE mode)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_SSE_4_1
+	x86_64_roundss_reg_reg(inst, dreg, sreg, mode);
+#else
+	/* Copy the xmm register to the stack */
+	x86_64_movss_membase_reg(inst, X86_64_RSP, -16, sreg);
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+	/* Load the value to the fpu */
+	x86_64_fld_membase_size(inst, X86_64_RSP, -16, 4);
+	/* And round it to integer */
+	x86_64_frndint(inst);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, -8);
+	/* and move st(0) to the destination register */
+	x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 4);
+	x86_64_movss_reg_membase(inst, dreg, X86_64_RSP, -16);
+#endif
+#else
+#ifdef HAVE_X86_SSE_4_1
+	x86_64_roundss_reg_reg(inst, dreg, sreg, mode);
+#else
+	/* allocate space on the stack for two ints and one long value */
+	x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+	/* Copy the xmm register to the stack */
+	x86_64_movss_regp_reg(inst, X86_64_RSP, sreg);
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode);
+	/* Load the value to the fpu */
+	x86_64_fld_regp_size(inst, X86_64_RSP, 4);
+	/* And round it to integer */
+	x86_64_frndint(inst);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, 8);
+	/* and move st(0) to the destination register */
+	x86_64_fstp_regp_size(inst, X86_64_RSP, 4);
+	x86_64_movss_reg_regp(inst, dreg, X86_64_RSP);
+	/* restore the stack pointer */
+	x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+	return inst;
+}
+
+static unsigned char *
+x86_64_rounds_reg_membase(unsigned char *inst, int dreg, int offset,
+						  int scratch_reg, X86_64_ROUNDMODE mode)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_SSE_4_1
+	x86_64_roundss_reg_membase(inst, dreg, X86_64_RBP, offset, mode);
+#else
+	/* Load the value to the fpu */
+	x86_64_fld_membase_size(inst, X86_64_RBP, offset, 4);
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+	/* And round it to integer */
+	x86_64_frndint(inst);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, -8);
+	/* and move st(0) to the destination register */
+	x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 4);
+	x86_64_movss_reg_membase(inst, dreg, X86_64_RSP, -16);
+#endif
+#else
+#ifdef HAVE_X86_SSE_4_1
+	x86_64_roundss_reg_membase(inst, dreg, X86_64_RBP, offset, mode);
+#else
+	/* allocate space on the stack for two ints and one long value */
+	x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+	/* Load the value to the fpu */
+	x86_64_fld_membase_size(inst, X86_64_RBP, offset, 4);
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode);
+	/* And round it to integer */
+	x86_64_frndint(inst);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, 8);
+	/* and move st(0) to the destination register */
+	x86_64_fstp_regp_size(inst, X86_64_RSP, 4);
+	x86_64_movss_reg_regp(inst, dreg, X86_64_RSP);
+	/* restore the stack pointer */
+	x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+	return inst;
+}
+
+/*
+ * Perform rounding of scalar double precision values.
+ * We have to use the fpu where SSE 4.1 is not supported.
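+ * With a red zone available the value is spilled to rsp-16 while the
+ * saved and the new fpu control word occupy the four bytes at rsp-4
+ * and rsp-8, so the stack pointer itself never has to be adjusted.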
+ */ +static unsigned char * +x86_64_roundd_reg_reg(unsigned char *inst, int dreg, int sreg, + int scratch_reg, X86_64_ROUNDMODE mode) +{ +#ifdef HAVE_RED_ZONE +#ifdef HAVE_X86_SSE_4_1 + x86_64_roundsd_reg_reg(inst, dreg, sreg, mode); +#else + /* Copy the xmm register to the stack */ + x86_64_movsd_membase_reg(inst, X86_64_RSP, -16, sreg); + /* Set the fpu round mode */ + inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode); + /* Load the value to the fpu */ + x86_64_fld_membase_size(inst, X86_64_RSP, -16, 8); + /* And round it to integer */ + x86_64_frndint(inst); + /* restore the fpu control word */ + inst = _x86_64_restore_fpcw(inst, -8); + /* and move st(0) to the destination register */ + x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 8); + x86_64_movsd_reg_membase(inst, dreg, X86_64_RSP, -16); +#endif +#else +#ifdef HAVE_X86_SSE_4_1 + x86_64_roundsd_reg_reg(inst, dreg, sreg, mode); +#else + /* allocate space on the stack for two ints and one long value */ + x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8); + /* Copy the xmm register to the stack */ + x86_64_movsd_regp_reg(inst, X86_64_RSP, sreg); + /* Set the fpu round mode */ + inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode); + /* Load the value to the fpu */ + x86_64_fld_regp_size(inst, X86_64_RSP, 8); + /* And round it to integer */ + x86_64_frndint(inst); + /* restore the fpu control word */ + inst = _x86_64_restore_fpcw(inst, 8); + /* and move st(0) to the destination register */ + x86_64_fstp_regp_size(inst, X86_64_RSP, 8); + x86_64_movsd_reg_regp(inst, dreg, X86_64_RSP); + /* restore the stack pointer */ + x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8); +#endif +#endif + return inst; +} + +static unsigned char * +x86_64_roundd_reg_membase(unsigned char *inst, int dreg, int offset, + int scratch_reg, X86_64_ROUNDMODE mode) +{ +#ifdef HAVE_RED_ZONE +#ifdef HAVE_X86_SSE_4_1 + x86_64_roundsd_reg_membase(inst, dreg, X86_64_RBP, offset, mode); +#else + /* Load the value to the fpu */ + x86_64_fld_membase_size(inst, X86_64_RBP, offset, 8); + /* Set the fpu round mode */ + inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode); + /* And round it to integer */ + x86_64_frndint(inst); + /* restore the fpu control word */ + inst = _x86_64_restore_fpcw(inst, -8); + /* and move st(0) to the destination register */ + x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 8); + x86_64_movsd_reg_membase(inst, dreg, X86_64_RSP, -16); +#endif +#else +#ifdef HAVE_X86_SSE_4_1 + x86_64_roundsd_reg_membase(inst, dreg, X86_64_RBP, offset, mode); +#else + /* allocate space on the stack for two ints and one long value */ + x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8); + /* Load the value to the fpu */ + x86_64_fld_membase_size(inst, X86_64_RBP, offset, 8); + /* Set the fpu round mode */ + inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode); + /* And round it to integer */ + x86_64_frndint(inst); + /* restore the fpu control word */ + inst = _x86_64_restore_fpcw(inst, 8); + /* and move st(0) to the destination register */ + x86_64_fstp_regp_size(inst, X86_64_RSP, 8); + x86_64_movsd_reg_regp(inst, dreg, X86_64_RSP); + /* restore the stack pointer */ + x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8); +#endif +#endif + return inst; +} + +/* + * Round the value in St(0) to integer according to the rounding + * mode specified. 
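+ * Unlike the xmm helpers above the value itself never leaves the fpu
+ * stack; only the control word is shuffled through memory. A sketch
+ * (hypothetical scratch register):
+ *
+ *	inst = x86_64_roundnf(inst, scratch_reg, X86_ROUND_NEAREST);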
+ */
+static unsigned char *
+x86_64_roundnf(unsigned char *inst, int scratch_reg, X86_64_ROUNDMODE mode)
+{
+#ifdef HAVE_RED_ZONE
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+	/* And round it to integer */
+	x86_64_frndint(inst);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, -8);
+#else
+	/* allocate space on the stack for two ints */
+	x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 0, mode);
+	/* And round it to integer */
+	x86_64_frndint(inst);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, 0);
+	/* restore the stack pointer */
+	x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
+	return inst;
+}
+
+/*
+ * Round the value in the fpu register st(0) to integer and
+ * store the value in dreg. st(0) is popped from the fpu stack.
+ */
+static unsigned char *
+x86_64_nfloat_to_int(unsigned char *inst, int dreg, int scratch_reg, int size)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_FISTTP
+	/* convert float to int */
+	x86_64_fisttp_membase_size(inst, X86_64_RSP, -8, size);
+	/* move result to the destination */
+	x86_64_mov_reg_membase_size(inst, dreg, X86_64_RSP, -8, size);
+#else
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, X86_ROUND_ZERO);
+	/* And round the value in st(0) to integer and store it on the stack */
+	x86_64_fistp_membase_size(inst, X86_64_RSP, -16, size);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, -8);
+	/* and load the integer to the destination register */
+	x86_64_mov_reg_membase_size(inst, dreg, X86_64_RSP, -16, size);
+#endif
+#else
+#ifdef HAVE_X86_FISTTP
+	/* allocate space on the stack for one long value */
+	x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+	/* convert float to int */
+	x86_64_fisttp_regp_size(inst, X86_64_RSP, size);
+	/* move result to the destination */
+	x86_64_mov_reg_regp_size(inst, dreg, X86_64_RSP, size);
+	/* restore the stack pointer */
+	x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#else
+	/* allocate space on the stack for 2 ints and one long value */
+	x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+	/* Set the fpu round mode */
+	inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, X86_ROUND_ZERO);
+	/* And round the value in st(0) to integer and store it on the stack */
+	x86_64_fistp_regp_size(inst, X86_64_RSP, size);
+	/* restore the fpu control word */
+	inst = _x86_64_restore_fpcw(inst, 8);
+	/* and load the integer to the destination register */
+	x86_64_mov_reg_regp_size(inst, dreg, X86_64_RSP, size);
+	/* restore the stack pointer */
+	x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+	return inst;
+}
+
 /*
  * Call a function
  */
@@ -1049,8 +1524,15 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value
 			{
 				int xmm_reg = _jit_reg_info[reg].cpu_reg;
 
-				_jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_MOV,
-						xmm_reg, &float32_value);
+				if(float32_value == (jit_float32) 0.0)
+				{
+					x86_64_clear_xreg(inst, xmm_reg);
+				}
+				else
+				{
+					_jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_MOV,
+							xmm_reg, &float32_value);
+				}
 			}
 			else
 			{
@@ -1069,7 +1551,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value
 				ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float32));
 				jit_memcpy(ptr, &float32_value, sizeof(float32_value));
 
-				offset = (jit_nint)ptr - ((jit_nint)inst + 7);
+				offset = (jit_nint)ptr
- ((jit_nint)inst + 6); if((offset >= jit_min_int) && (offset <= jit_max_int)) { /* We can use RIP relative addressing here */ @@ -1084,7 +1566,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value else { /* We have to use an extra general register */ - /* TODO */ + TODO(); } } } @@ -1111,8 +1593,15 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value { int xmm_reg = _jit_reg_info[reg].cpu_reg; - _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_MOV, - xmm_reg, &float64_value); + if(float64_value == (jit_float64) 0.0) + { + x86_64_clear_xreg(inst, xmm_reg); + } + else + { + _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_MOV, + xmm_reg, &float64_value); + } } else { @@ -1131,7 +1620,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float64)); jit_memcpy(ptr, &float64_value, sizeof(float64_value)); - offset = (jit_nint)ptr - ((jit_nint)inst + 7); + offset = (jit_nint)ptr - ((jit_nint)inst + 6); if((offset >= jit_min_int) && (offset <= jit_max_int)) { /* We can use RIP relative addressing here */ @@ -1146,7 +1635,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value else { /* We have to use an extra general register */ - /* TODO */ + TODO(); } } } @@ -1192,7 +1681,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value else { /* We have to use an extra general register */ - /* TODO */ + TODO(); } } else @@ -1212,7 +1701,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_nfloat)); jit_memcpy(ptr, &nfloat_value, sizeof(nfloat_value)); - offset = (jit_nint)ptr - ((jit_nint)inst + 7); + offset = (jit_nint)ptr - ((jit_nint)inst + 6); if((offset >= jit_min_int) && (offset <= jit_max_int)) { /* We can use RIP relative addressing here */ @@ -1241,7 +1730,7 @@ _jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value else { /* We have to use an extra general register */ - /* TODO */ + TODO(); } } } @@ -2315,11 +2804,6 @@ flush_return_struct(unsigned char *inst, jit_value_t value) return inst; } -#define TODO() \ - do { \ - fprintf(stderr, "TODO at %s, %d\n", __FILE__, (int)__LINE__); \ - } while (0) - void _jit_gen_insn(jit_gencode_t gen, jit_function_t func, jit_block_t block, jit_insn_t insn) diff --git a/jit/jit-rules-x86-64.ins b/jit/jit-rules-x86-64.ins index ccf7e08..659e98f 100644 --- a/jit/jit-rules-x86-64.ins +++ b/jit/jit-rules-x86-64.ins @@ -22,6 +22,7 @@ %regclass reg x86_64_reg %regclass creg x86_64_creg +%regclass dreg x86_64_dreg %regclass rreg x86_64_rreg %regclass sreg x86_64_sreg %regclass freg x86_64_freg @@ -91,62 +92,106 @@ JIT_OP_EXPAND_UINT: x86_64_mov_reg_reg_size(inst, $1, $2, 4); } +JIT_OP_INT_TO_NFLOAT: + [=freg, local] -> { + x86_64_fild_membase_size(inst, X86_64_RBP, $2, 4); + } + [=freg, reg] -> { +#ifdef HAVE_RED_ZONE + x86_64_mov_membase_reg_size(inst, X86_64_RSP, -8, $2, 4); + x86_64_fild_membase_size(inst, X86_64_RSP, -8, 4); +#else + x86_64_push_reg_size(inst, $2, 8); + x86_64_fild_membase_size(inst, X86_64_RSP, 0, 4); + x86_64_add_reg_imm_size(inst, X86_64_RSP, sizeof(jit_nint), 8); +#endif + } + +JIT_OP_LONG_TO_NFLOAT: + [=freg, local] -> { + x86_64_fild_membase_size(inst, X86_64_RBP, $2, 8); + } + [=freg, reg] -> { +#ifdef HAVE_RED_ZONE + x86_64_mov_membase_reg_size(inst, X86_64_RSP, -8, $2, 8); + x86_64_fild_membase_size(inst, X86_64_RSP, -8, 8); +#else + 
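+	/* No red zone: push the value so fild can read it through the stack pointer */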
diff --git a/jit/jit-rules-x86-64.ins b/jit/jit-rules-x86-64.ins
index ccf7e08..659e98f 100644
--- a/jit/jit-rules-x86-64.ins
+++ b/jit/jit-rules-x86-64.ins
@@ -22,6 +22,7 @@
 %regclass reg x86_64_reg
 %regclass creg x86_64_creg
+%regclass dreg x86_64_dreg
 %regclass rreg x86_64_rreg
 %regclass sreg x86_64_sreg
 %regclass freg x86_64_freg
@@ -91,62 +92,106 @@ JIT_OP_EXPAND_UINT:
 		x86_64_mov_reg_reg_size(inst, $1, $2, 4);
 	}
 
+JIT_OP_INT_TO_NFLOAT:
+	[=freg, local] -> {
+		x86_64_fild_membase_size(inst, X86_64_RBP, $2, 4);
+	}
+	[=freg, reg] -> {
+#ifdef HAVE_RED_ZONE
+		x86_64_mov_membase_reg_size(inst, X86_64_RSP, -8, $2, 4);
+		x86_64_fild_membase_size(inst, X86_64_RSP, -8, 4);
+#else
+		x86_64_push_reg_size(inst, $2, 8);
+		x86_64_fild_membase_size(inst, X86_64_RSP, 0, 4);
+		x86_64_add_reg_imm_size(inst, X86_64_RSP, sizeof(jit_nint), 8);
+#endif
+	}
+
+JIT_OP_LONG_TO_NFLOAT:
+	[=freg, local] -> {
+		x86_64_fild_membase_size(inst, X86_64_RBP, $2, 8);
+	}
+	[=freg, reg] -> {
+#ifdef HAVE_RED_ZONE
+		x86_64_mov_membase_reg_size(inst, X86_64_RSP, -8, $2, 8);
+		x86_64_fild_membase_size(inst, X86_64_RSP, -8, 8);
+#else
+		x86_64_push_reg_size(inst, $2, 8);
+		x86_64_fild_membase_size(inst, X86_64_RSP, 0, 8);
+		x86_64_add_reg_imm_size(inst, X86_64_RSP, sizeof(jit_nint), 8);
+#endif
+	}
+
 JIT_OP_NFLOAT_TO_INT: stack
-	[=reg, freg] -> {
-		/* allocate space on the stack for 2 shorts and 1 int */
+	[=reg, freg, scratch reg] -> {
+		inst = x86_64_nfloat_to_int(inst, $1, $3, 4);
+	}
+
+JIT_OP_NFLOAT_TO_LONG: stack
+	[=reg, freg, scratch reg] -> {
+		inst = x86_64_nfloat_to_int(inst, $1, $3, 8);
+	}
+
+JIT_OP_FLOAT32_TO_NFLOAT:
+	[=freg, local] -> {
+		x86_64_fld_membase_size(inst, X86_64_RBP, $2, 4);
+	}
+	[=freg, xreg] -> {
+#ifdef HAVE_RED_ZONE
+		x86_64_movss_membase_reg(inst, X86_64_RSP, -8, $2);
+		x86_64_fld_membase_size(inst, X86_64_RSP, -8, 4);
+#else
 		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
-		/* store FPU control word */
-		x86_64_fnstcw_membase(inst, X86_64_RSP, 0);
-		/* set "round toward zero" mode */
-		x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 0, 2);
-		x86_64_or_reg_imm_size(inst, $1, 0xc00, 2);
-		x86_64_mov_membase_reg_size(inst, X86_64_RSP, 2, $1, 2);
-		x86_64_fldcw_membase(inst, X86_64_RSP, 2);
-		/* convert float to int */
-		x86_64_fistp_membase_size(inst, X86_64_RSP, 4, 4);
-		/* restore FPU control word */
-		x86_64_fldcw_membase(inst, X86_64_RSP, 0);
-		/* move result to the destination */
-		x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 4, 4);
-		/* restore the stack */
+		x86_64_movss_regp_reg(inst, X86_64_RSP, $2);
+		x86_64_fld_regp_size(inst, X86_64_RSP, 4);
 		x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
 	}
 
-JIT_OP_NFLOAT_TO_LONG: stack
-	[=reg, freg] -> {
-		/* allocate space on the stack for 2 shorts and 1 long */
-		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 12, 8);
-		/* store FPU control word */
-		x86_64_fnstcw_membase(inst, X86_64_RSP, 0);
-		/* set "round toward zero" mode */
-		x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 0, 2);
-		x86_64_or_reg_imm_size(inst, $1, 0xc00, 2);
-		x86_64_mov_membase_reg_size(inst, X86_64_RSP, 2, $1, 2);
-		x86_64_fldcw_membase(inst, X86_64_RSP, 2);
-		/* convert float to long */
-		x86_64_fistp_membase_size(inst, X86_64_RSP, 4, 8);
-		/* restore FPU control word */
-		x86_64_fldcw_membase(inst, X86_64_RSP, 0);
-		/* move result to the destination */
-		x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 4, 8);
-		/* restore the stack */
-		x86_64_add_reg_imm_size(inst, X86_64_RSP, 12, 8);
+JIT_OP_FLOAT64_TO_NFLOAT:
+	[=freg, local] -> {
+		x86_64_fld_membase_size(inst, X86_64_RBP, $2, 8);
+	}
+	[=freg, xreg] -> {
+#ifdef HAVE_RED_ZONE
+		x86_64_movsd_membase_reg(inst, X86_64_RSP, -8, $2);
+		x86_64_fld_membase_size(inst, X86_64_RSP, -8, 8);
+#else
+		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+		x86_64_movsd_regp_reg(inst, X86_64_RSP, $2);
+		x86_64_fld_regp_size(inst, X86_64_RSP, 8);
+		x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
 	}
 
 JIT_OP_NFLOAT_TO_FLOAT32: stack
 	[=xreg, freg] -> {
+#ifdef HAVE_RED_ZONE
 		/* Avoid modifying the stack pointer by simply using negative */
 		/* offsets here. */
 		x86_64_fstp_membase_size(inst, X86_64_RSP, -8, 4);
 		x86_64_movss_reg_membase(inst, $1, X86_64_RSP, -8);
+#else
+		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+		x86_64_fstp_regp_size(inst, X86_64_RSP, 4);
+		x86_64_movss_reg_regp(inst, $1, X86_64_RSP);
+		x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
 	}
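[Editorial note: the rewritten JIT_OP_NFLOAT_TO_INT and JIT_OP_NFLOAT_TO_LONG rules above delegate to x86_64_nfloat_to_int() instead of emitting the conversion inline. The removed sequence saved the x87 control word, or'ed in 0xc00 (bits 10 and 11 form the rounding-control field; both set selects truncation), converted with fistp, and then restored the word. The HAVE_RED_ZONE fast paths rely on the System V x86-64 ABI guarantee that the 128 bytes below %rsp are preserved across asynchronous events, so short sequences may use -8(%rsp) without moving the stack pointer. In portable C99 terms the removed rounding dance behaves like the sketch below, which is an illustration only, not libjit code.

	#include <fenv.h>
	#include <math.h>

	/* Truncating conversion: force round-toward-zero, convert with the
	 * current rounding mode, then restore the previous mode.  This is
	 * what fnstcw / or $0xc00 / fldcw / fistp / fldcw did inline. */
	static long truncate_nfloat(long double value)
	{
		int saved_mode = fegetround();
		long result;

		fesetround(FE_TOWARDZERO);
		result = lrintl(value);	/* rounds using the current mode */
		fesetround(saved_mode);
		return result;
	}

]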
 JIT_OP_NFLOAT_TO_FLOAT64: stack
 	[=xreg, freg] -> {
+#ifdef HAVE_RED_ZONE
 		/* Avoid modifying the stack pointer by simply using negative */
 		/* offsets here. */
 		x86_64_fstp_membase_size(inst, X86_64_RSP, -8, 8);
 		x86_64_movsd_reg_membase(inst, $1, X86_64_RSP, -8);
+#else
+		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+		x86_64_fstp_regp_size(inst, X86_64_RSP, 8);
+		x86_64_movsd_reg_regp(inst, $1, X86_64_RSP);
+		x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
 	}
 
 /*
@@ -894,12 +939,12 @@ JIT_OP_IDIV: more_space
 		x86_64_cmov_reg_reg_size(inst, X86_CC_S, $1, $3, 1, 4);
 		x86_64_sar_reg_imm_size(inst, $1, shift, 4);
 	}
-	[reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $3, $2, 4);
 		x86_64_cdq(inst);
 		x86_64_idiv_reg_size(inst, $3, 4);
 	}
-	[reg("rax"), reg, scratch reg("rdx")] -> {
+	[reg("rax"), dreg, scratch reg("rdx")] -> {
 		jit_int min_int = jit_min_int;
 		unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -937,12 +982,12 @@ JIT_OP_IDIV_UN: more_space
 		}
 		x86_64_shr_reg_imm_size(inst, $1, shift, 4);
 	}
-	[reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $3, $2, 4);
 		x86_64_clear_reg(inst, X86_64_RDX);
 		x86_64_div_reg_size(inst, $3, 4);
 	}
-	[reg("rax"), reg, scratch reg("rdx")] -> {
+	[reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
 		unsigned char *patch;
 		x86_64_test_reg_reg_size(inst, $2, $2, 4);
@@ -974,12 +1019,12 @@ JIT_OP_IREM: more_space
 		x86_patch(patch, inst);
 		x86_64_clear_reg(inst, $1);
 	}
-	[=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $4, $3, 4);
 		x86_64_cdq(inst);
 		x86_64_idiv_reg_size(inst, $4, 4);
 	}
-	[=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
 		jit_int min_int = jit_min_int;
 		unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -1009,16 +1054,16 @@ JIT_OP_IREM_UN: more_space
 	[reg, imm, if("$2 == 1")] -> {
 		x86_64_clear_reg(inst, $1);
 	}
-	[reg, imm, if("(((jit_nuint)$2) & (((jit_nuint)$2) - 1)) == 0")] -> {
+	[reg, imm, if("($2 & ($2 - 1)) == 0")] -> {
 		/* x & (x - 1) is equal to zero if x is a power of 2 */
 		x86_64_and_reg_imm_size(inst, $1, $2 - 1, 4);
 	}
-	[=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $4, $3, 4);
 		x86_64_clear_reg(inst, X86_64_RDX);
 		x86_64_div_reg_size(inst, $4, 4);
 	}
-	[=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
 		unsigned char *patch;
 		x86_64_test_reg_reg_size(inst, $3, $3, 4);
@@ -1170,12 +1215,12 @@ JIT_OP_LDIV: more_space
 		x86_64_cmov_reg_reg_size(inst, X86_CC_S, $1, $3, 1, 8);
 		x86_64_sar_reg_imm_size(inst, $1, shift, 8);
 	}
-	[reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $3, $2, 8);
 		x86_64_cqo(inst);
 		x86_64_idiv_reg_size(inst, $3, 8);
 	}
-	[reg("rax"), reg, scratch reg("rdx")] -> {
+	[reg("rax"), dreg, scratch reg("rdx")] -> {
 		jit_long min_long = jit_min_long;
 		unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
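[Editorial note: several immediate alternatives above avoid div/idiv entirely. Division by a power of two becomes a shift, and an unsigned remainder by a power of two becomes a single and, guarded by the power-of-two test in the if() clause. A self-checking C sketch of the two identities the rules rely on, illustration only:

	#include <assert.h>

	/* A power of two has exactly one bit set, so clearing the lowest
	 * set bit (x & (x - 1)) leaves zero. */
	static int is_power_of_two(unsigned int x)
	{
		return x != 0 && (x & (x - 1)) == 0;
	}

	/* Unsigned remainder by a power of two is just the low bits, which
	 * is the single "and" the IREM_UN immediate alternative emits. */
	static unsigned int rem_pow2(unsigned int x, unsigned int m)
	{
		assert(is_power_of_two(m));
		return x & (m - 1);
	}

	int main(void)
	{
		assert(rem_pow2(1000u, 8u) == 1000u % 8u);
		assert(is_power_of_two(64u) && !is_power_of_two(12u));
		return 0;
	}

]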
@@ -1214,12 +1259,12 @@ JIT_OP_LDIV_UN: more_space
 		}
 		x86_64_shr_reg_imm_size(inst, $1, shift, 8);
 	}
-	[reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $3, $2, 8);
 		x86_64_clear_reg(inst, X86_64_RDX);
 		x86_64_div_reg_size(inst, $3, 8);
 	}
-	[reg("rax"), reg, scratch reg("rdx")] -> {
+	[reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
 		unsigned char *patch;
 		x86_64_test_reg_reg_size(inst, $2, $2, 8);
@@ -1251,12 +1296,12 @@ JIT_OP_LREM: more_space
 		x86_patch(patch, inst);
 		x86_64_clear_reg(inst, $1);
 	}
-	[=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $4, $3, 8);
 		x86_64_cqo(inst);
 		x86_64_idiv_reg_size(inst, $4, 8);
 	}
-	[=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
 		jit_long min_long = jit_min_long;
 		unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -1301,12 +1346,12 @@ JIT_OP_LREM_UN: more_space
 			x86_64_and_reg_reg_size(inst, $1, $3, 8);
 		}
 	}
-	[=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
 		x86_64_mov_reg_imm_size(inst, $4, $3, 8);
 		x86_64_clear_reg(inst, X86_64_RDX);
 		x86_64_div_reg_size(inst, $4, 8);
 	}
-	[=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+	[=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
 		unsigned char *patch;
 		x86_64_test_reg_reg_size(inst, $3, $3, 8);
@@ -1367,6 +1412,22 @@ JIT_OP_FDIV:
 		x86_64_divss_reg_membase(inst, $1, X86_64_RBP, $2);
 	}
 
+JIT_OP_FABS:
+	[xreg] -> {
+		/* Simply clear the sign */
+		jit_uint values[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+
+		_jit_plops_reg_imm(gen, &inst, XMM_ANDP, $1, &(values[0]));
+	}
+
+JIT_OP_FNEG:
+	[xreg] -> {
+		/* Simply toggle the sign */
+		jit_uint values[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+
+		_jit_plops_reg_imm(gen, &inst, XMM_XORP, $1, &(values[0]));
+	}
+
 /*
  * double precision float versions
  */
@@ -1415,6 +1476,35 @@ JIT_OP_DDIV:
 		x86_64_divsd_reg_reg(inst, $1, $2);
 	}
 
+JIT_OP_DABS:
+	[xreg] -> {
+		/* Simply clear the sign */
+		jit_ulong values[2] = {0x7fffffffffffffff, 0x7fffffffffffffff};
+
+		_jit_plopd_reg_imm(gen, &inst, XMM_ANDP, $1, &(values[0]));
+	}
+
+JIT_OP_DNEG:
+	[xreg] -> {
+		/* Simply toggle the sign */
+		jit_ulong values[2] = {0x8000000000000000, 0x8000000000000000};
+
+		_jit_plopd_reg_imm(gen, &inst, XMM_XORP, $1, &(values[0]));
+	}
+
+/*
+ * native float versions
+ */
+JIT_OP_NFABS: stack
+	[freg] -> {
+		x86_64_fabs(inst);
+	}
+
+JIT_OP_NFNEG: stack
+	[freg] -> {
+		x86_64_fchs(inst);
+	}
+
 /*
  * Bitwise opcodes.
 */
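[Editorial note: the new JIT_OP_FABS/JIT_OP_FNEG rules (and the DABS/DNEG double versions) work purely on the bit pattern. IEEE 754 keeps the sign in the most significant bit of each value, so andps with a 0x7fffffff mask in every lane clears it and xorps with 0x80000000 toggles it, branch-free and correct for -0.0 and NaN inputs as well. The same trick on one scalar lane in portable C, illustration only:

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	static float float_abs(float f)
	{
		uint32_t bits;
		memcpy(&bits, &f, sizeof(bits));
		bits &= UINT32_C(0x7fffffff);	/* clear the sign bit */
		memcpy(&f, &bits, sizeof(f));
		return f;
	}

	static float float_neg(float f)
	{
		uint32_t bits;
		memcpy(&bits, &f, sizeof(bits));
		bits ^= UINT32_C(0x80000000);	/* toggle the sign bit */
		memcpy(&f, &bits, sizeof(f));
		return f;
	}

	int main(void)
	{
		assert(float_abs(-2.5f) == 2.5f);
		assert(float_neg(2.5f) == -2.5f);
		return 0;
	}

]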
@@ -1872,13 +1962,133 @@ JIT_OP_BR_LGE_UN: branch
 		inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
 	}
 
+JIT_OP_BR_FEQ:
+	[xreg, local] -> {
+		x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x74 /* eq */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x74 /* eq */, insn);
+	}
+
+JIT_OP_BR_FNE:
+	[xreg, local] -> {
+		x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x75 /* ne */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x75 /* ne */, insn);
+	}
+
+JIT_OP_BR_FLT:
+	[xreg, local] -> {
+		x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+	}
+
+JIT_OP_BR_FLE:
+	[xreg, local] -> {
+		x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+	}
+
+JIT_OP_BR_FGT:
+	[xreg, local] -> {
+		x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+	}
+
+JIT_OP_BR_FGE:
+	[xreg, local] -> {
+		x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+	}
+
+JIT_OP_BR_DEQ:
+	[xreg, local] -> {
+		x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x74 /* eq */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x74 /* eq */, insn);
+	}
+
+JIT_OP_BR_DNE:
+	[xreg, local] -> {
+		x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x75 /* ne */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x75 /* ne */, insn);
+	}
+
+JIT_OP_BR_DLT:
+	[xreg, local] -> {
+		x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+	}
+
+JIT_OP_BR_DLE:
+	[xreg, local] -> {
+		x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+	}
+
+JIT_OP_BR_DGT:
+	[xreg, local] -> {
+		x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+	}
+
+JIT_OP_BR_DGE:
+	[xreg, local] -> {
+		x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+		inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+	}
+	[xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $1, $2);
+		inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+	}
+
 /*
  * Comparison opcodes.
 */
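[Editorial note: comiss/comisd set ZF, PF and CF the way an unsigned integer compare would, and flag unordered operands (NaNs) by setting all three. That is why the float branch rules above reuse the unsigned condition opcodes 0x72/0x76/0x77/0x73 (jb/jbe/ja/jae) rather than the signed jl/jg family, and why the comments carry the _un suffix: jb, for example, is taken both for "below" and for unordered operands, since unordered sets CF. At the source level the visible consequence is the usual IEEE ordering around NaN, as this small program shows (illustration only):

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double nan = NAN;

		/* every ordered comparison involving NaN is false... */
		printf("nan <  1.0 : %d\n", nan < 1.0);		/* 0 */
		printf("nan == nan : %d\n", nan == nan);	/* 0 */
		/* ...so only != is true */
		printf("nan != 1.0 : %d\n", nan != 1.0);	/* 1 */
		return 0;
	}

]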
 JIT_OP_IEQ:
 	[=reg, reg, immzero] -> {
-		x86_64_or_reg_reg_size(inst, $2, $2, 4);
+		x86_64_test_reg_reg_size(inst, $2, $2, 4);
 		inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
 	}
 	[=reg, reg, imm] -> {
@@ -1896,7 +2106,7 @@ JIT_OP_IEQ:
 
 JIT_OP_INE:
 	[=reg, reg, immzero] -> {
-		x86_64_or_reg_reg_size(inst, $2, $2, 4);
+		x86_64_test_reg_reg_size(inst, $2, $2, 4);
 		inst = setcc_reg(inst, $1, X86_CC_NE, 0);
 	}
 	[=reg, reg, imm] -> {
@@ -2026,7 +2236,7 @@ JIT_OP_IGE_UN:
 
 JIT_OP_LEQ:
 	[=reg, reg, immzero] -> {
-		x86_64_or_reg_reg_size(inst, $2, $2, 8);
+		x86_64_test_reg_reg_size(inst, $2, $2, 8);
 		inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
 	}
 	[=reg, reg, imm, if("$3 >= (jit_nint)jit_min_int && $3 <= (jit_nint)jit_max_int")] -> {
@@ -2044,7 +2254,7 @@ JIT_OP_LEQ:
 
 JIT_OP_LNE:
 	[=reg, reg, immzero] -> {
-		x86_64_or_reg_reg_size(inst, $2, $2, 8);
+		x86_64_test_reg_reg_size(inst, $2, $2, 8);
 		inst = setcc_reg(inst, $1, X86_CC_NE, 0);
 	}
 	[=reg, reg, imm, if("$3 >= (jit_nint)jit_min_int && $3 <= (jit_nint)jit_max_int")] -> {
@@ -2172,6 +2382,232 @@ JIT_OP_LGE_UN:
 		inst = setcc_reg(inst, $1, X86_CC_GE, 0);
 	}
 
+JIT_OP_FEQ:
+	[=reg, xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+	}
+
+JIT_OP_FNE:
+	[=reg, xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+	}
+
+JIT_OP_FLT:
+	[=reg, xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_B, 0);
+	}
+
+JIT_OP_FLE:
+	[=reg, xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_BE, 0);
+	}
+
+JIT_OP_FGT:
+	[=reg, xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_A, 0);
+	}
+
+JIT_OP_FGE:
+	[=reg, xreg, xreg] -> {
+		x86_64_comiss_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_AE, 0);
+	}
+
+JIT_OP_DEQ:
+	[=reg, xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+	}
+
+JIT_OP_DNE:
+	[=reg, xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+	}
+
+JIT_OP_DLT:
+	[=reg, xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_B, 0);
+	}
+
+JIT_OP_DLE:
+	[=reg, xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_BE, 0);
+	}
+
+JIT_OP_DGT:
+	[=reg, xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_A, 0);
+	}
+
+JIT_OP_DGE:
+	[=reg, xreg, xreg] -> {
+		x86_64_comisd_reg_reg(inst, $2, $3);
+		inst = setcc_reg(inst, $1, X86_CC_AE, 0);
+	}
+
+JIT_OP_FSQRT:
+	[=xreg, local] -> {
+		x86_64_sqrtss_reg_membase(inst, $1, X86_64_RBP, $2);
+	}
+	[=xreg, xreg] -> {
+		x86_64_sqrtss_reg_reg(inst, $1, $2);
+	}
+
+JIT_OP_DSQRT:
+	[=xreg, local] -> {
+		x86_64_sqrtsd_reg_membase(inst, $1, X86_64_RBP, $2);
+	}
+	[=xreg, xreg] -> {
+		x86_64_sqrtsd_reg_reg(inst, $1, $2);
+	}
+
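[Editorial note: two idioms in the surrounding hunks deserve a word. First, the immzero alternatives now emit test reg, reg instead of or reg, reg; both set ZF the same way for a zero check, but test only updates flags and never rewrites the register. Second, the min/max opcodes in the next hunk are branch-free, pairing cmp with a conditional move; the generated pattern corresponds to the C sketch below (illustration only):

	#include <assert.h>

	/* cmp $1, $2 ; cmovl $1, $2  (signed maximum) */
	static int int_max(int a, int b)
	{
		return (a < b) ? b : a;
	}

	/* cmp $1, $2 ; cmovb $1, $2  (unsigned maximum) */
	static unsigned int uint_max(unsigned int a, unsigned int b)
	{
		return (a < b) ? b : a;
	}

	int main(void)
	{
		assert(int_max(-3, 2) == 2);
		assert(uint_max(3u, 7u) == 7u);
		return 0;
	}

]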
+/*
+ * Absolute, minimum, maximum, and sign.
+ */
+JIT_OP_IMAX:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 1, 4);
+	}
+
+JIT_OP_IMAX_UN:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 0, 4);
+	}
+
+JIT_OP_IMIN:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 1, 4);
+	}
+
+JIT_OP_IMIN_UN:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 0, 4);
+	}
+
+JIT_OP_LMAX:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 1, 8);
+	}
+
+JIT_OP_LMAX_UN:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 0, 8);
+	}
+
+JIT_OP_LMIN:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 1, 8);
+	}
+
+JIT_OP_LMIN_UN:
+	[reg, reg] -> {
+		x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+		x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 0, 8);
+	}
+
+JIT_OP_FMAX:
+	[xreg, local] -> {
+		x86_64_maxss_reg_membase(inst, $1, X86_64_RBP, $2);
+	}
+	[xreg, xreg] -> {
+		x86_64_maxss_reg_reg(inst, $1, $2);
+	}
+
+JIT_OP_FMIN:
+	[xreg, local] -> {
+		x86_64_minss_reg_membase(inst, $1, X86_64_RBP, $2);
+	}
+	[xreg, xreg] -> {
+		x86_64_minss_reg_reg(inst, $1, $2);
+	}
+
+JIT_OP_DMAX:
+	[xreg, local] -> {
+		x86_64_maxsd_reg_membase(inst, $1, X86_64_RBP, $2);
+	}
+	[xreg, xreg] -> {
+		x86_64_maxsd_reg_reg(inst, $1, $2);
+	}
+
+JIT_OP_DMIN:
+	[xreg, local] -> {
+		x86_64_minsd_reg_membase(inst, $1, X86_64_RBP, $2);
+	}
+	[xreg, xreg] -> {
+		x86_64_minsd_reg_reg(inst, $1, $2);
+	}
+
+/*
+ * Rounding
+ */
+JIT_OP_FFLOOR: more_space
+	[=xreg, local, scratch reg] -> {
+		inst = x86_64_rounds_reg_membase(inst, $1, $2, $3, X86_ROUND_DOWN);
+	}
+	[=xreg, xreg, scratch reg] -> {
+		inst = x86_64_rounds_reg_reg(inst, $1, $2, $3, X86_ROUND_DOWN);
+	}
+
+JIT_OP_DFLOOR: more_space
+	[=xreg, local, scratch reg] -> {
+		inst = x86_64_roundd_reg_membase(inst, $1, $2, $3, X86_ROUND_DOWN);
+	}
+	[=xreg, xreg, scratch reg] -> {
+		inst = x86_64_roundd_reg_reg(inst, $1, $2, $3, X86_ROUND_DOWN);
+	}
+
+JIT_OP_NFFLOOR: more_space
+	[freg, scratch reg] -> {
+		inst = x86_64_roundnf(inst, $2, X86_ROUND_DOWN);
+	}
+
+JIT_OP_FCEIL: more_space
+	[=xreg, local, scratch reg] -> {
+		inst = x86_64_rounds_reg_membase(inst, $1, $2, $3, X86_ROUND_UP);
+	}
+	[=xreg, xreg, scratch reg] -> {
+		inst = x86_64_rounds_reg_reg(inst, $1, $2, $3, X86_ROUND_UP);
+	}
+
+JIT_OP_DCEIL: more_space
+	[=xreg, local, scratch reg] -> {
+		inst = x86_64_roundd_reg_membase(inst, $1, $2, $3, X86_ROUND_UP);
+	}
+	[=xreg, xreg, scratch reg] -> {
+		inst = x86_64_roundd_reg_reg(inst, $1, $2, $3, X86_ROUND_UP);
+	}
+
+JIT_OP_NFCEIL: more_space
+	[freg, scratch reg] -> {
+		inst = x86_64_roundnf(inst, $2, X86_ROUND_UP);
+	}
+
+/*
+JIT_OP_FRINT: more_space
+	[=xreg, local, scratch reg] -> {
+		inst = x86_64_rounds_reg_membase(inst, $1, $2, $3, X86_ROUND_ZERO);
+	}
+	[=xreg, xreg, scratch reg] -> {
+		inst = x86_64_rounds_reg_reg(inst, $1, $2, $3, X86_ROUND_ZERO);
+	}
+*/
+
 /*
  * Pointer check opcodes.
  */
-- 
2.47.3