From 6c2aafbdf7f67261fe6a8eb74522c6a0f4a7e8bb Mon Sep 17 00:00:00 2001 From: Frode Solheim Date: Sat, 5 Sep 2015 17:59:56 +0200 Subject: [PATCH] JIT: renamed compemu_raw_x86.cpp -> codegen_x86.cpp --- jit/codegen_x86.cpp | 3464 +++++++++++++++++++++++++++++++++++++++ jit/compemu_support.cpp | 2 +- 2 files changed, 3465 insertions(+), 1 deletion(-) create mode 100644 jit/codegen_x86.cpp diff --git a/jit/codegen_x86.cpp b/jit/codegen_x86.cpp new file mode 100644 index 00000000..41835f7b --- /dev/null +++ b/jit/codegen_x86.cpp @@ -0,0 +1,3464 @@ +/* This should eventually end up in machdep/, but for now, x86 is the +only target, and it's easier this way... */ + +/************************************************************************* +* Some basic information about the the target CPU * +*************************************************************************/ + +#define EAX_INDEX 0 +#define ECX_INDEX 1 +#define EDX_INDEX 2 +#define EBX_INDEX 3 +#define ESP_INDEX 4 +#define EBP_INDEX 5 +#define ESI_INDEX 6 +#define EDI_INDEX 7 +#if defined(__x86_64__) +#define R8_INDEX 8 +#define R9_INDEX 9 +#define R10_INDEX 10 +#define R11_INDEX 11 +#define R12_INDEX 12 +#define R13_INDEX 13 +#define R14_INDEX 14 +#define R15_INDEX 15 +#endif +/* XXX this has to match X86_Reg8H_Base + 4 */ +#define AH_INDEX (0x10+4+EAX_INDEX) +#define CH_INDEX (0x10+4+ECX_INDEX) +#define DH_INDEX (0x10+4+EDX_INDEX) +#define BH_INDEX (0x10+4+EBX_INDEX) + +/* The register in which subroutines return an integer return value */ +#define REG_RESULT EAX_INDEX + +/* The registers subroutines take their first and second argument in */ +#if defined(_WIN32) +/* Handle the _fastcall parameters of ECX and EDX */ +#define REG_PAR1 ECX_INDEX +#define REG_PAR2 EDX_INDEX +#elif defined(__x86_64__) +#define REG_PAR1 EDI_INDEX +#define REG_PAR2 ESI_INDEX +#else +#define REG_PAR1 EAX_INDEX +#define REG_PAR2 EDX_INDEX +#endif + +#if defined(_WIN32) +#define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */ +#define REG_PC_TMP ECX_INDEX +#define SHIFTCOUNT_NREG ECX_INDEX /* Register that can be used for shiftcount. -1 if any reg will do */ +#else +#define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */ +#define REG_PC_TMP ECX_INDEX /* Another register that is not the above */ +#define SHIFTCOUNT_NREG ECX_INDEX /* Register that can be used for shiftcount. -1 if any reg will do */ +#endif + +#define MUL_NREG1 EAX_INDEX /* %eax will hold the low 32 bits after a 32x32 mul */ +#define MUL_NREG2 EDX_INDEX /* %edx will hold the high 32 bits */ + +#define STACK_ALIGN 16 +#define STACK_OFFSET sizeof(void *) + +uae_u8 always_used[]={4,0xff}; +#if defined(__x86_64__) +uae_s8 can_byte[]={0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,-1}; +uae_s8 can_word[]={0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,-1}; +#else +uae_u8 can_byte[]={0,1,2,3,0xff}; +uae_u8 can_word[]={0,1,2,3,5,6,7,0xff}; +#endif + +uae_u8 call_saved[]={0,0,0,0,1,0,0,0}; + +/* This *should* be the same as call_saved. But: +- We might not really know which registers are saved, and which aren't, +so we need to preserve some, but don't want to rely on everyone else +also saving those registers +- Special registers (such like the stack pointer) should not be "preserved" +by pushing, even though they are "saved" across function calls +*/ +uae_u8 need_to_preserve[]={1,1,1,1,0,1,1,1}; + +/* Whether classes of instructions do or don't clobber the native flags */ +#define CLOBBER_MOV +#define CLOBBER_LEA +#define CLOBBER_CMOV +#define CLOBBER_POP +#define CLOBBER_PUSH +#define CLOBBER_SUB clobber_flags() +#define CLOBBER_SBB clobber_flags() +#define CLOBBER_CMP clobber_flags() +#define CLOBBER_ADD clobber_flags() +#define CLOBBER_ADC clobber_flags() +#define CLOBBER_AND clobber_flags() +#define CLOBBER_OR clobber_flags() +#define CLOBBER_XOR clobber_flags() + +#define CLOBBER_ROL clobber_flags() +#define CLOBBER_ROR clobber_flags() +#define CLOBBER_SHLL clobber_flags() +#define CLOBBER_SHRL clobber_flags() +#define CLOBBER_SHRA clobber_flags() +#define CLOBBER_TEST clobber_flags() +#define CLOBBER_CL16 +#define CLOBBER_CL8 +#define CLOBBER_SE16 +#define CLOBBER_SE8 +#define CLOBBER_ZE16 +#define CLOBBER_ZE8 +#define CLOBBER_SW16 clobber_flags() +#define CLOBBER_SW32 +#define CLOBBER_SETCC +#define CLOBBER_MUL clobber_flags() +#define CLOBBER_BT clobber_flags() +#define CLOBBER_BSF clobber_flags() + +/************************************************************************* +* Actual encoding of the instructions on the target CPU * +*************************************************************************/ + +//#include "compemu_optimizer_x86.c" + +STATIC_INLINE uae_u16 swap16(uae_u16 x) +{ + return ((x&0xff00)>>8)|((x&0x00ff)<<8); +} + +STATIC_INLINE uae_u32 swap32(uae_u32 x) +{ + return ((x&0xff00)<<8)|((x&0x00ff)<<24)|((x&0xff0000)>>8)|((x&0xff000000)>>24); +} + +STATIC_INLINE int isbyte(uae_s32 x) +{ + return (x>=-128 && x<=127); +} + +LOWFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r)) +{ + emit_byte(0x50+r); +} +LENDFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r)) + + LOWFUNC(NONE,READ,1,raw_pop_l_r,(R4 r)) +{ + emit_byte(0x58+r); +} +LENDFUNC(NONE,READ,1,raw_pop_l_r,(R4 r)) + + LOWFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i)) +{ + emit_byte(0x0f); + emit_byte(0xba); + emit_byte(0xe0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b)) +{ + emit_byte(0x0f); + emit_byte(0xa3); + emit_byte(0xc0+8*b+r); +} +LENDFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b)) + + LOWFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0x0f); + emit_byte(0xba); + emit_byte(0xf8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b)) +{ + emit_byte(0x0f); + emit_byte(0xbb); + emit_byte(0xc0+8*b+r); +} +LENDFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b)) + + + LOWFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0x0f); + emit_byte(0xba); + emit_byte(0xf0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b)) +{ + emit_byte(0x0f); + emit_byte(0xb3); + emit_byte(0xc0+8*b+r); +} +LENDFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b)) + + LOWFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0x0f); + emit_byte(0xba); + emit_byte(0xe8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b)) +{ + emit_byte(0x0f); + emit_byte(0xab); + emit_byte(0xc0+8*b+r); +} +LENDFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b)) + + LOWFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i)) +{ + emit_byte(0x66); + if (isbyte(i)) { + emit_byte(0x83); + emit_byte(0xe8+d); + emit_byte(i); + } + else { + emit_byte(0x81); + emit_byte(0xe8+d); + emit_word(i); + } +} +LENDFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i)) + + + LOWFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s)) +{ + emit_byte(0xc7); + emit_byte(0x05); + emit_long(d); + emit_long(s); +} +LENDFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s)) + + LOWFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s)) +{ + emit_byte(0x66); + emit_byte(0xc7); + emit_byte(0x05); + emit_long(d); + emit_word(s); +} +LENDFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s)) + + LOWFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s)) +{ + emit_byte(0xc6); + emit_byte(0x05); + emit_long(d); + emit_byte(s); +} +LENDFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s)) + + LOWFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i)) +{ + emit_byte(0xc0); + emit_byte(0x05); + emit_long(d); + emit_byte(i); +} +LENDFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i)) +{ + emit_byte(0xc0); + emit_byte(0xc0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i)) +{ + emit_byte(0x66); + emit_byte(0xc1); + emit_byte(0xc0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0xc1); + emit_byte(0xc0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r)) +{ + emit_byte(0xd3); + emit_byte(0xc0+d); +} +LENDFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r)) +{ + emit_byte(0x66); + emit_byte(0xd3); + emit_byte(0xc0+d); +} +LENDFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r)) +{ + emit_byte(0xd2); + emit_byte(0xc0+d); +} +LENDFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r)) +{ + emit_byte(0xd3); + emit_byte(0xe0+d); +} +LENDFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r)) +{ + emit_byte(0x66); + emit_byte(0xd3); + emit_byte(0xe0+d); +} +LENDFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r)) +{ + emit_byte(0xd2); + emit_byte(0xe0+d); +} +LENDFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i)) +{ + emit_byte(0xc0); + emit_byte(0xc8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i)) +{ + emit_byte(0x66); + emit_byte(0xc1); + emit_byte(0xc8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0xc1); + emit_byte(0xc8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r)) +{ + emit_byte(0xd3); + emit_byte(0xc8+d); +} +LENDFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r)) +{ + emit_byte(0x66); + emit_byte(0xd3); + emit_byte(0xc8+d); +} +LENDFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r)) +{ + emit_byte(0xd2); + emit_byte(0xc8+d); +} +LENDFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r)) +{ + emit_byte(0xd3); + emit_byte(0xe8+d); +} +LENDFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r)) +{ + emit_byte(0x66); + emit_byte(0xd3); + emit_byte(0xe8+d); +} +LENDFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r)) +{ + emit_byte(0xd2); + emit_byte(0xe8+d); +} +LENDFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r)) +{ + emit_byte(0xd3); + emit_byte(0xf8+d); +} +LENDFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r)) +{ + emit_byte(0x66); + emit_byte(0xd3); + emit_byte(0xf8+d); +} +LENDFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r)) +{ + emit_byte(0xd2); + emit_byte(0xf8+d); +} +LENDFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r)) + + LOWFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0xc1); + emit_byte(0xe0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i)) +{ + emit_byte(0x66); + emit_byte(0xc1); + emit_byte(0xe0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i)) +{ + emit_byte(0xc0); + emit_byte(0xe0+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0xc1); + emit_byte(0xe8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i)) +{ + emit_byte(0x66); + emit_byte(0xc1); + emit_byte(0xe8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i)) +{ + emit_byte(0xc0); + emit_byte(0xe8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i)) +{ + emit_byte(0xc1); + emit_byte(0xf8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i)) +{ + emit_byte(0x66); + emit_byte(0xc1); + emit_byte(0xf8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i)) +{ + emit_byte(0xc0); + emit_byte(0xf8+r); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i)) + + LOWFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah)) +{ + emit_byte(0x9e); +} +LENDFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah)) + + LOWFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax)) +{ + emit_byte(0x0f); + emit_byte(0xa2); +} +LENDFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax)) + + LOWFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah)) +{ + emit_byte(0x9f); +} +LENDFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah)) + + LOWFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc)) +{ + emit_byte(0x0f); + emit_byte(0x90+cc); + emit_byte(0xc0+d); +} +LENDFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc)) + + LOWFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc)) +{ + emit_byte(0x0f); + emit_byte(0x90+cc); + emit_byte(0x05); + emit_long(d); +} +LENDFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc)) + + LOWFUNC(READ,NONE,3,raw_cmov_b_rr,(RW1 d, R1 s, IMM cc)) +{ + /* replacement using branch and mov */ + int uncc=(cc^1); + emit_byte(0x70+uncc); + emit_byte(3); /* skip next 2 bytes if not cc=true */ + emit_byte(0x88); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(READ,NONE,3,raw_cmov_b_rr,(RW1 d, R1 s, IMM cc)) + + LOWFUNC(READ,NONE,3,raw_cmov_w_rr,(RW2 d, R2 s, IMM cc)) +{ + if (have_cmov) { + emit_byte(0x66); + emit_byte(0x0f); + emit_byte(0x40+cc); + emit_byte(0xc0+8*d+s); + } + else { /* replacement using branch and mov */ + int uncc=(cc^1); + emit_byte(0x70+uncc); + emit_byte(3); /* skip next 3 bytes if not cc=true */ + emit_byte(0x66); + emit_byte(0x89); + emit_byte(0xc0+8*s+d); + } +} +LENDFUNC(READ,NONE,3,raw_cmov_w_rr,(RW2 d, R2 s, IMM cc)) + + LOWFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc)) +{ + if (have_cmov) { + emit_byte(0x0f); + emit_byte(0x40+cc); + emit_byte(0xc0+8*d+s); + } + else { /* replacement using branch and mov */ + int uncc=(cc^1); + emit_byte(0x70+uncc); + emit_byte(2); /* skip next 2 bytes if not cc=true */ + emit_byte(0x89); + emit_byte(0xc0+8*s+d); + } +} +LENDFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc)) + + LOWFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s)) +{ + emit_byte(0x0f); + emit_byte(0xbc); + emit_byte(0xc0+8*d+s); +} +LENDFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s)) + + LOWFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s)) +{ + emit_byte(0x0f); + emit_byte(0xbf); + emit_byte(0xc0+8*d+s); +} +LENDFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s)) + + LOWFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s)) +{ + emit_byte(0x0f); + emit_byte(0xbe); + emit_byte(0xc0+8*d+s); +} +LENDFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s)) + + LOWFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s)) +{ + emit_byte(0x0f); + emit_byte(0xb7); + emit_byte(0xc0+8*d+s); +} +LENDFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s)) + + LOWFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s)) +{ + emit_byte(0x0f); + emit_byte(0xb6); + emit_byte(0xc0+8*d+s); +} +LENDFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s)) + + LOWFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s)) +{ + emit_byte(0x0f); + emit_byte(0xaf); + emit_byte(0xc0+8*d+s); +} +LENDFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s)) + + LOWFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s)) +{ +#ifdef JIT_DEBUG + if (d!=MUL_NREG1 || s!=MUL_NREG2) { + write_log (_T("JIT: Bad register in IMUL: d=%d, s=%d\n"),d,s); + abort(); + } +#endif + emit_byte(0xf7); + emit_byte(0xea); +} +LENDFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s)) + + LOWFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s)) +{ +#ifdef JIT_DEBUG + if (d!=MUL_NREG1 || s!=MUL_NREG2) { + write_log (_T("JIT: Bad register in MUL: d=%d, s=%d\n"),d,s); + abort(); + } +#endif + emit_byte(0xf7); + emit_byte(0xe2); +} +LENDFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s)) + + LOWFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s)) +{ + emit_byte(0x88); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s)) + + LOWFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x89); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s)) + + LOWFUNC(NONE,READ,3,raw_mov_l_rrm_indexed,(W4 d, R4 baser, R4 index)) +{ + emit_byte(0x8b); + if (baser==5) { + emit_byte(0x44+8*d); + emit_byte(8*index+baser); + emit_byte(0); + return; + } + emit_byte(0x04+8*d); + emit_byte(8*index+baser); +} +LENDFUNC(NONE,READ,3,raw_mov_l_rrm_indexed,(W4 d, R4 baser, R4 index)) + + LOWFUNC(NONE,READ,3,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index)) +{ + emit_byte(0x66); + emit_byte(0x8b); + if (baser==5) { + emit_byte(0x44+8*d); + emit_byte(8*index+baser); + emit_byte(0); + return; + } + emit_byte(0x04+8*d); + emit_byte(8*index+baser); +} +LENDFUNC(NONE,READ,3,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index)) + + LOWFUNC(NONE,READ,3,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index)) +{ + emit_byte(0x8a); + if (baser==5) { + emit_byte(0x44+8*d); + emit_byte(8*index+baser); + emit_byte(0); + return; + } + emit_byte(0x04+8*d); + emit_byte(8*index+baser); +} +LENDFUNC(NONE,READ,3,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index)) + + LOWFUNC(NONE,WRITE,3,raw_mov_l_mrr_indexed,(R4 baser, R4 index, R4 s)) +{ + emit_byte(0x89); + if (baser==5) { + emit_byte(0x44+8*s); + emit_byte(8*index+baser); + emit_byte(0); + return; + } + emit_byte(0x04+8*s); + emit_byte(8*index+baser); +} +LENDFUNC(NONE,WRITE,3,raw_mov_l_mrr_indexed,(R4 baser, R4 index, R4 s)) + + LOWFUNC(NONE,WRITE,3,raw_mov_w_mrr_indexed,(R4 baser, R4 index, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x89); + if (baser==5) { + emit_byte(0x44+8*s); + emit_byte(8*index+baser); + emit_byte(0); + return; + } + emit_byte(0x04+8*s); + emit_byte(8*index+baser); +} +LENDFUNC(NONE,WRITE,3,raw_mov_w_mrr_indexed,(R4 baser, R4 index, R2 s)) + + LOWFUNC(NONE,WRITE,3,raw_mov_b_mrr_indexed,(R4 baser, R4 index, R1 s)) +{ + emit_byte(0x88); + if (baser==5) { + emit_byte(0x44+8*s); + emit_byte(8*index+baser); + emit_byte(0); + return; + } + emit_byte(0x04+8*s); + emit_byte(8*index+baser); +} +LENDFUNC(NONE,WRITE,3,raw_mov_b_mrr_indexed,(R4 baser, R4 index, R1 s)) + + LOWFUNC(NONE,READ,3,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index)) +{ + emit_byte(0x8b); + emit_byte(0x04+8*d); + emit_byte(0x85+8*index); + emit_long(base); +} +LENDFUNC(NONE,READ,3,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index)) + + LOWFUNC(NONE,READ,4,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM cond)) +{ + if (have_cmov) { + emit_byte(0x0f); + emit_byte(0x40+cond); + } + else { /* replacement using branch and mov */ + int uncc=(cond^1); + emit_byte(0x70+uncc); + emit_byte(7); /* skip next 7 bytes if not cc=true */ + emit_byte(0x8b); + } + emit_byte(0x04+8*d); + emit_byte(0x85+8*index); + emit_long(base); +} +LENDFUNC(NONE,READ,4,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM cond)) + + LOWFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond)) +{ + if (have_cmov) { + emit_byte(0x0f); + emit_byte(0x40+cond); + emit_byte(0x05+8*d); + emit_long(mem); + } + else { /* replacement using branch and mov */ + int uncc=(cond^1); + emit_byte(0x70+uncc); + emit_byte(6); /* skip next 6 bytes if not cc=true */ + emit_byte(0x8b); + emit_byte(0x05+8*d); + emit_long(mem); + } +} +LENDFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond)) + + LOWFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset)) +{ + emit_byte(0x8b); + emit_byte(0x40+8*d+s); + emit_byte(offset); +} +LENDFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset)) + + LOWFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset)) +{ + emit_byte(0x66); + emit_byte(0x8b); + emit_byte(0x40+8*d+s); + emit_byte(offset); +} +LENDFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset)) + + LOWFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset)) +{ + emit_byte(0x8a); + emit_byte(0x40+8*d+s); + emit_byte(offset); +} +LENDFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset)) + + LOWFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset)) +{ + emit_byte(0x8b); + emit_byte(0x80+8*d+s); + emit_long(offset); +} +LENDFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset)) + + LOWFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset)) +{ + emit_byte(0x66); + emit_byte(0x8b); + emit_byte(0x80+8*d+s); + emit_long(offset); +} +LENDFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset)) + + LOWFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset)) +{ + emit_byte(0x8a); + emit_byte(0x80+8*d+s); + emit_long(offset); +} +LENDFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset)) +{ + emit_byte(0xc7); + emit_byte(0x40+d); + emit_byte(offset); + emit_long(i); +} +LENDFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset)) +{ + emit_byte(0x66); + emit_byte(0xc7); + emit_byte(0x40+d); + emit_byte(offset); + emit_word(i); +} +LENDFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset)) +{ + emit_byte(0xc6); + emit_byte(0x40+d); + emit_byte(offset); + emit_byte(i); +} +LENDFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset)) +{ + emit_byte(0x89); + emit_byte(0x40+8*s+d); + emit_byte(offset); +} +LENDFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset)) +{ + emit_byte(0x66); + emit_byte(0x89); + emit_byte(0x40+8*s+d); + emit_byte(offset); +} +LENDFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset)) +{ + emit_byte(0x88); + emit_byte(0x40+8*s+d); + emit_byte(offset); +} +LENDFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset)) + + LOWFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset)) +{ + emit_byte(0x8d); + emit_byte(0x80+8*d+s); + emit_long(offset); +} +LENDFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset)) + + LOWFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset)) +{ + emit_byte(0x8d); + if (!offset) { + if (s!=5) { + emit_byte(0x04+8*d); + emit_byte(0x40*factor+8*index+s); + return; + } + emit_byte(0x44+8*d); + emit_byte(0x40*factor+8*index+s); + emit_byte(0); + return; + } + emit_byte(0x84+8*d); + emit_byte(0x40*factor+8*index+s); + emit_long(offset); +} +LENDFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset)) + + LOWFUNC(NONE,NONE,3,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index)) +{ + emit_byte(0x8d); + if (s==5) { + emit_byte(0x44+8*d); + emit_byte(8*index+s); + emit_byte(0); + return; + } + emit_byte(0x04+8*d); + emit_byte(8*index+s); +} +LENDFUNC(NONE,NONE,3,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index)) + + LOWFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset)) +{ + emit_byte(0x89); + emit_byte(0x80+8*s+d); + emit_long(offset); +} +LENDFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset)) +{ + emit_byte(0x66); + emit_byte(0x89); + emit_byte(0x80+8*s+d); + emit_long(offset); +} +LENDFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset)) + + LOWFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset)) +{ + emit_byte(0x88); + emit_byte(0x80+8*s+d); + emit_long(offset); +} +LENDFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset)) + + LOWFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r)) +{ + emit_byte(0x0f); + emit_byte(0xc8+r); +} +LENDFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r)) + + LOWFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r)) +{ + emit_byte(0x66); + emit_byte(0xc1); + emit_byte(0xc0+r); + emit_byte(0x08); +} +LENDFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r)) + + LOWFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s)) +{ + emit_byte(0x89); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s)) + + LOWFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s)) +{ + emit_byte(0x89); + emit_byte(0x05+8*s); + emit_long(d); +} +LENDFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s)) + + LOWFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s)) +{ + emit_byte(0x8b); + emit_byte(0x05+8*d); + emit_long(s); +} +LENDFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s)) + + LOWFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x89); + emit_byte(0x05+8*s); + emit_long(d); +} +LENDFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s)) + + LOWFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s)) +{ + emit_byte(0x66); + emit_byte(0x8b); + emit_byte(0x05+8*d); + emit_long(s); +} +LENDFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s)) + + LOWFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s)) +{ + emit_byte(0x88); + emit_byte(0x05+8*s); + emit_long(d); +} +LENDFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s)) + + LOWFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s)) +{ + emit_byte(0x8a); + emit_byte(0x05+8*d); + emit_long(s); +} +LENDFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s)) + + LOWFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s)) +{ + emit_byte(0xb8+d); + emit_long(s); +} +LENDFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s)) + + LOWFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s)) +{ + emit_byte(0x66); + emit_byte(0xb8+d); + emit_word(s); +} +LENDFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s)) + + LOWFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s)) +{ + emit_byte(0xb0+d); + emit_byte(s); +} +LENDFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s)) + + LOWFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s)) +{ + emit_byte(0x81); + emit_byte(0x15); + emit_long(d); + emit_long(s); +} +LENDFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s)) + + LOWFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s)) +{ + emit_byte(0x81); + emit_byte(0x05); + emit_long(d); + emit_long(s); +} +LENDFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s)) + + LOWFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s)) +{ + emit_byte(0x66); + emit_byte(0x81); + emit_byte(0x05); + emit_long(d); + emit_word(s); +} +LENDFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s)) + + LOWFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s)) +{ + emit_byte(0x80); + emit_byte(0x05); + emit_long(d); + emit_byte(s); +} +LENDFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s)) + + LOWFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i)) +{ + emit_byte(0xf7); + emit_byte(0xc0+d); + emit_long(i); +} +LENDFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s)) +{ + emit_byte(0x85); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x85); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s)) +{ + emit_byte(0x84); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i)) +{ + emit_byte(0x81); + emit_byte(0xe0+d); + emit_long(i); +} +LENDFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i)) +{ + emit_byte(0x66); + emit_byte(0x81); + emit_byte(0xe0+d); + emit_word(i); +} +LENDFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s)) +{ + emit_byte(0x21); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x21); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s)) +{ + emit_byte(0x20); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i)) +{ + emit_byte(0x81); + emit_byte(0xc8+d); + emit_long(i); +} +LENDFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s)) +{ + emit_byte(0x09); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x09); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s)) +{ + emit_byte(0x08); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s)) + + LOWFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s)) +{ + emit_byte(0x11); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s)) + + LOWFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x11); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s)) + + LOWFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s)) +{ + emit_byte(0x10); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s)) +{ + emit_byte(0x01); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x01); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s)) +{ + emit_byte(0x00); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i)) +{ + if (isbyte(i)) { + emit_byte(0x83); + emit_byte(0xe8+d); + emit_byte(i); + } + else { + emit_byte(0x81); + emit_byte(0xe8+d); + emit_long(i); + } +} +LENDFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i)) +{ + emit_byte(0x80); + emit_byte(0xe8+d); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i)) +{ + if (isbyte(i)) { + emit_byte(0x83); + emit_byte(0xc0+d); + emit_byte(i); + } + else { + emit_byte(0x81); + emit_byte(0xc0+d); + emit_long(i); + } +} +LENDFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i)) +{ + if (isbyte(i)) { + emit_byte(0x66); + emit_byte(0x83); + emit_byte(0xc0+d); + emit_byte(i); + } + else { + emit_byte(0x66); + emit_byte(0x81); + emit_byte(0xc0+d); + emit_word(i); + } +} +LENDFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i)) +{ + emit_byte(0x80); + emit_byte(0xc0+d); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i)) + + LOWFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s)) +{ + emit_byte(0x19); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s)) + + LOWFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x19); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s)) + + LOWFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s)) +{ + emit_byte(0x18); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s)) +{ + emit_byte(0x29); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x29); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s)) +{ + emit_byte(0x28); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s)) +{ + emit_byte(0x39); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i)) +{ + emit_byte(0x81); + emit_byte(0xf8+r); + emit_long(i); +} +LENDFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x39); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i)) +{ + emit_byte(0x80); + emit_byte(0xf8+d); + emit_byte(i); +} +LENDFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i)) + + LOWFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s)) +{ + emit_byte(0x38); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s)) + + LOWFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s)) +{ + emit_byte(0x31); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s)) + + LOWFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s)) +{ + emit_byte(0x66); + emit_byte(0x31); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s)) + + LOWFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s)) +{ + emit_byte(0x30); + emit_byte(0xc0+8*s+d); +} +LENDFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s)) + + LOWFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s)) +{ + emit_byte(0x81); + emit_byte(0x2d); + emit_long(d); + emit_long(s); +} +LENDFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s)) + + LOWFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s)) +{ + emit_byte(0x81); + emit_byte(0x3d); + emit_long(d); + emit_long(s); +} +LENDFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s)) + + LOWFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2)) +{ + emit_byte(0x87); + emit_byte(0xc0+8*r1+r2); +} +LENDFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2)) + + LOWFUNC(READ,WRITE,0,raw_pushfl,(void)) +{ + emit_byte(0x9c); +} +LENDFUNC(READ,WRITE,0,raw_pushfl,(void)) + + LOWFUNC(WRITE,READ,0,raw_popfl,(void)) +{ + emit_byte(0x9d); +} +LENDFUNC(WRITE,READ,0,raw_popfl,(void)) + + /************************************************************************* + * Unoptimizable stuff --- jump * + *************************************************************************/ + + STATIC_INLINE void raw_call_r(R4 r) +{ + lopt_emit_all(); + emit_byte(0xff); + emit_byte(0xd0+r); +} + +STATIC_INLINE void raw_jmp_r(R4 r) +{ + lopt_emit_all(); + emit_byte(0xff); + emit_byte(0xe0+r); +} + +STATIC_INLINE void raw_jmp_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m) +{ + int sib; + + switch (m) { + case 1: sib = 0x05; break; + case 2: sib = 0x45; break; + case 4: sib = 0x85; break; + case 8: sib = 0xC5; break; + default: abort(); + } + lopt_emit_all(); + emit_byte(0xff); + emit_byte(0x24); + emit_byte(8*r+sib); + emit_long(base); +} + +STATIC_INLINE void raw_jmp_m(uae_u32 base) +{ + lopt_emit_all(); + emit_byte(0xff); + emit_byte(0x25); + emit_long(base); +} + +STATIC_INLINE void raw_call(uae_u32 t) +{ + lopt_emit_all(); + emit_byte(0xe8); + emit_long(t-(uae_u32)target-4); +} + +STATIC_INLINE void raw_jmp(uae_u32 t) +{ + lopt_emit_all(); + emit_byte(0xe9); + emit_long(t-(uae_u32)target-4); +} + +STATIC_INLINE void raw_jl(uae_u32 t) +{ + lopt_emit_all(); + emit_byte(0x0f); + emit_byte(0x8c); + emit_long(t-(uae_u32)target-4); +} + +STATIC_INLINE void raw_jz(uae_u32 t) +{ + lopt_emit_all(); + emit_byte(0x0f); + emit_byte(0x84); + emit_long(t-(uae_u32)target-4); +} + +STATIC_INLINE void raw_jnz(uae_u32 t) +{ + lopt_emit_all(); + emit_byte(0x0f); + emit_byte(0x85); + emit_long(t-(uae_u32)target-4); +} + +STATIC_INLINE void raw_jnz_l_oponly(void) +{ + lopt_emit_all(); + emit_byte(0x0f); + emit_byte(0x85); +} + +STATIC_INLINE void raw_jcc_l_oponly(int cc) +{ + lopt_emit_all(); + emit_byte(0x0f); + emit_byte(0x80+cc); +} + +STATIC_INLINE void raw_jnz_b_oponly(void) +{ + lopt_emit_all(); + emit_byte(0x75); +} + +STATIC_INLINE void raw_jz_b_oponly(void) +{ + lopt_emit_all(); + emit_byte(0x74); +} + +STATIC_INLINE void raw_jmp_l_oponly(void) +{ + lopt_emit_all(); + emit_byte(0xe9); +} + +STATIC_INLINE void raw_jmp_b_oponly(void) +{ + lopt_emit_all(); + emit_byte(0xeb); +} + +STATIC_INLINE void raw_ret(void) +{ + lopt_emit_all(); + emit_byte(0xc3); +} + +STATIC_INLINE void raw_nop(void) +{ + lopt_emit_all(); + emit_byte(0x90); +} + + +/************************************************************************* +* Flag handling, to and fro UAE flag register * +*************************************************************************/ + + +#define FLAG_NREG1 0 /* Set to -1 if any register will do */ + +STATIC_INLINE void raw_flags_to_reg(int r) +{ + raw_lahf(0); /* Most flags in AH */ + //raw_setcc(r,0); /* V flag in AL */ + raw_setcc_m((uae_u32)live.state[FLAGTMP].mem,0); + +#if 1 /* Let's avoid those nasty partial register stalls */ + //raw_mov_b_mr((uae_u32)live.state[FLAGTMP].mem,r); + raw_mov_b_mr(((uae_u32)live.state[FLAGTMP].mem)+1,r+4); + //live.state[FLAGTMP].status=CLEAN; + live.state[FLAGTMP].status=INMEM; + live.state[FLAGTMP].realreg=-1; + /* We just "evicted" FLAGTMP. */ + if (live.nat[r].nholds!=1) { + /* Huh? */ + abort(); + } + live.nat[r].nholds=0; +#endif +} + +#define FLAG_NREG2 0 /* Set to -1 if any register will do */ +STATIC_INLINE void raw_reg_to_flags(int r) +{ + raw_cmp_b_ri(r,-127); /* set V */ + raw_sahf(0); +} + +/* Apparently, there are enough instructions between flag store and +flag reload to avoid the partial memory stall */ +STATIC_INLINE void raw_load_flagreg(uae_u32 target, uae_u32 r) +{ +#if 1 + raw_mov_l_rm(target,(uae_u32)live.state[r].mem); +#else + raw_mov_b_rm(target,(uae_u32)live.state[r].mem); + raw_mov_b_rm(target+4,((uae_u32)live.state[r].mem)+1); +#endif +} + +/* FLAGX is word-sized */ +STATIC_INLINE void raw_load_flagx(uae_u32 target, uae_u32 r) +{ + if (live.nat[target].canword) + raw_mov_w_rm(target,(uae_u32)live.state[r].mem); + else + raw_mov_l_rm(target,(uae_u32)live.state[r].mem); +} + +#define NATIVE_FLAG_Z 0x40 +#define NATIVE_CC_EQ 4 +STATIC_INLINE void raw_flags_set_zero(int f, int r, int t) +{ + // FIXME: this is really suboptimal + raw_pushfl(); + raw_pop_l_r(f); + raw_and_l_ri(f,~NATIVE_FLAG_Z); + raw_test_l_rr(r,r); + raw_mov_l_ri(r,0); + raw_mov_l_ri(t,NATIVE_FLAG_Z); + raw_cmov_l_rr(r,t,NATIVE_CC_EQ); + raw_or_l(f,r); + raw_push_l_r(f); + raw_popfl(); +} + +STATIC_INLINE void raw_inc_sp(int off) +{ + raw_add_l_ri(4,off); +} + + +/************************************************************************* +* Handling mistaken direct memory access * +*************************************************************************/ + +#ifdef UAE +#include "exception_handler.cpp" +#endif + + +/************************************************************************* +* Checking for CPU features * +*************************************************************************/ + +struct cpuinfo_x86 { + uae_u8 x86; // CPU family + uae_u8 x86_vendor; // CPU vendor + uae_u8 x86_processor; // CPU canonical processor type + uae_u8 x86_brand_id; // CPU BrandID if supported, yield 0 otherwise + uae_u32 x86_hwcap; + uae_u8 x86_model; + uae_u8 x86_mask; + int cpuid_level; // Maximum supported CPUID level, -1=no CPUID + char x86_vendor_id[16]; +}; +struct cpuinfo_x86 cpuinfo; + +enum { + X86_VENDOR_INTEL = 0, + X86_VENDOR_CYRIX = 1, + X86_VENDOR_AMD = 2, + X86_VENDOR_UMC = 3, + X86_VENDOR_NEXGEN = 4, + X86_VENDOR_CENTAUR = 5, + X86_VENDOR_RISE = 6, + X86_VENDOR_TRANSMETA = 7, + X86_VENDOR_NSC = 8, + X86_VENDOR_UNKNOWN = 0xff +}; + +enum { + X86_PROCESSOR_I386, /* 80386 */ + X86_PROCESSOR_I486, /* 80486DX, 80486SX, 80486DX[24] */ + X86_PROCESSOR_PENTIUM, + X86_PROCESSOR_PENTIUMPRO, + X86_PROCESSOR_K6, + X86_PROCESSOR_ATHLON, + X86_PROCESSOR_PENTIUM4, + X86_PROCESSOR_K8, + X86_PROCESSOR_max +}; + +static struct ptt { + const int align_loop; + const int align_loop_max_skip; + const int align_jump; + const int align_jump_max_skip; + const int align_func; +} +x86_alignments[X86_PROCESSOR_max + 1] = { + { 4, 3, 4, 3, 4 }, + { 16, 15, 16, 15, 16 }, + { 16, 7, 16, 7, 16 }, + { 16, 15, 16, 7, 16 }, + { 32, 7, 32, 7, 32 }, + { 16, 7, 16, 7, 16 }, + { 0, 0, 0, 0, 0 }, + { 16, 7, 16, 7, 16 }, + { 0, 0, 0, 0, 0 } +}; + +static void + x86_get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + + if (!strcmp(v, "GenuineIntel")) + c->x86_vendor = X86_VENDOR_INTEL; + else if (!strcmp(v, "AuthenticAMD")) + c->x86_vendor = X86_VENDOR_AMD; + else if (!strcmp(v, "CyrixInstead")) + c->x86_vendor = X86_VENDOR_CYRIX; + else if (!strcmp(v, "Geode by NSC")) + c->x86_vendor = X86_VENDOR_NSC; + else if (!strcmp(v, "UMC UMC UMC ")) + c->x86_vendor = X86_VENDOR_UMC; + else if (!strcmp(v, "CentaurHauls")) + c->x86_vendor = X86_VENDOR_CENTAUR; + else if (!strcmp(v, "NexGenDriven")) + c->x86_vendor = X86_VENDOR_NEXGEN; + else if (!strcmp(v, "RiseRiseRise")) + c->x86_vendor = X86_VENDOR_RISE; + else if (!strcmp(v, "GenuineTMx86") || + !strcmp(v, "TransmetaCPU")) + c->x86_vendor = X86_VENDOR_TRANSMETA; + else + c->x86_vendor = X86_VENDOR_UNKNOWN; +} + +static void cpuid(uae_u32 op, uae_u32 *eax, uae_u32 *ebx, uae_u32 *ecx, uae_u32 *edx) +{ + const int CPUID_SPACE = 4096; + uae_u8* cpuid_space = (uae_u8*)cache_alloc(CPUID_SPACE); + if (cpuid_space == 0) + abort (); + static uae_u32 s_op, s_eax, s_ebx, s_ecx, s_edx; + uae_u8* tmp=get_target(); + + s_op = op; + set_target(cpuid_space); + raw_push_l_r(0); /* eax */ + raw_push_l_r(1); /* ecx */ + raw_push_l_r(2); /* edx */ + raw_push_l_r(3); /* ebx */ + raw_mov_l_rm(0,(uintptr)&s_op); + raw_cpuid(0); + raw_mov_l_mr((uintptr)&s_eax,0); + raw_mov_l_mr((uintptr)&s_ebx,3); + raw_mov_l_mr((uintptr)&s_ecx,1); + raw_mov_l_mr((uintptr)&s_edx,2); + raw_pop_l_r(3); + raw_pop_l_r(2); + raw_pop_l_r(1); + raw_pop_l_r(0); + raw_ret(); + set_target(tmp); + + ((compop_func*)cpuid_space)(0); + if (eax != NULL) *eax = s_eax; + if (ebx != NULL) *ebx = s_ebx; + if (ecx != NULL) *ecx = s_ecx; + if (edx != NULL) *edx = s_edx; + + cache_free (cpuid_space); +} + +static void raw_init_cpu(void) +{ + struct cpuinfo_x86 *c = &cpuinfo; + uae_u32 xlvl; + + /* Defaults */ + c->x86_processor = X86_PROCESSOR_max; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_hwcap = 0; + + /* Get vendor name */ + c->x86_vendor_id[12] = '\0'; + cpuid(0x00000000, + (uae_u32 *)&c->cpuid_level, + (uae_u32 *)&c->x86_vendor_id[0], + (uae_u32 *)&c->x86_vendor_id[8], + (uae_u32 *)&c->x86_vendor_id[4]); + x86_get_cpu_vendor(c); + + /* Intel-defined flags: level 0x00000001 */ + c->x86_brand_id = 0; + if ( c->cpuid_level >= 0x00000001 ) { + uae_u32 tfms, brand_id; + cpuid(0x00000001, &tfms, &brand_id, NULL, &c->x86_hwcap); + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + c->x86_brand_id = brand_id & 0xff; + if ( (c->x86_vendor == X86_VENDOR_AMD) && + (c->x86 == 0xf)) { + /* AMD Extended Family and Model Values */ + c->x86 += (tfms >> 20) & 0xff; + c->x86_model += (tfms >> 12) & 0xf0; + } + c->x86_mask = tfms & 15; + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + + /* AMD-defined flags: level 0x80000001 */ + cpuid(0x80000000, &xlvl, NULL, NULL, NULL); + if ( (xlvl & 0xffff0000) == 0x80000000 ) { + if ( xlvl >= 0x80000001 ) { + uae_u32 features; + cpuid(0x80000001, NULL, NULL, NULL, &features); + if (features & (1 << 29)) { + /* Assume x86-64 if long mode is supported */ + c->x86_processor = X86_PROCESSOR_K8; + } + } + } + + /* Canonicalize processor ID */ + switch (c->x86) { + case 3: + c->x86_processor = X86_PROCESSOR_I386; + break; + case 4: + c->x86_processor = X86_PROCESSOR_I486; + break; + case 5: + if (c->x86_vendor == X86_VENDOR_AMD) + c->x86_processor = X86_PROCESSOR_K6; + else + c->x86_processor = X86_PROCESSOR_PENTIUM; + break; + case 6: + if (c->x86_vendor == X86_VENDOR_AMD) + c->x86_processor = X86_PROCESSOR_ATHLON; + else + c->x86_processor = X86_PROCESSOR_PENTIUMPRO; + break; + case 15: + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* Assume any BrandID >= 8 and family == 15 yields a Pentium 4 */ + if (c->x86_brand_id >= 8) + c->x86_processor = X86_PROCESSOR_PENTIUM4; + } + if (c->x86_vendor == X86_VENDOR_AMD) { + /* Assume an Athlon processor if family == 15 and it was not + detected as an x86-64 so far */ + if (c->x86_processor == X86_PROCESSOR_max) + c->x86_processor = X86_PROCESSOR_ATHLON; + } + break; + } + + /* Have CMOV support? */ + have_cmov = c->x86_hwcap & (1 << 15); + +#if 0 + /* Can the host CPU suffer from partial register stalls? */ + have_rat_stall = (c->x86_vendor == X86_VENDOR_INTEL); + /* It appears that partial register writes are a bad idea even on + AMD K7 cores, even though they are not supposed to have the + dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */ + if (c->x86_processor == X86_PROCESSOR_ATHLON) + have_rat_stall = 1; +#endif + have_rat_stall = 1; + + /* Alignments */ + if (tune_alignment) { + align_loops = x86_alignments[c->x86_processor].align_loop; + align_jumps = x86_alignments[c->x86_processor].align_jump; + } + { + TCHAR *s = au (c->x86_vendor_id); + write_log (_T("CPUID level=%d, Family=%d, Model=%d, Mask=%d, Vendor=%s [%d]\n"), + c->cpuid_level, c->x86, c->x86_model, c->x86_mask, s, c->x86_vendor); + xfree (s); + } +} + +#if 0 +static int target_check_bsf(void) +{ + int mismatch = 0; + for (int g_ZF = 0; g_ZF <= 1; g_ZF++) { + for (int g_CF = 0; g_CF <= 1; g_CF++) { + for (int g_OF = 0; g_OF <= 1; g_OF++) { + for (int g_SF = 0; g_SF <= 1; g_SF++) { + for (int value = -1; value <= 1; value++) { + unsigned long flags = (g_SF << 7) | (g_OF << 11) | (g_ZF << 6) | g_CF; + unsigned long tmp = value; + __asm__ __volatile__ ("push %0; popf; bsf %1,%1; pushf; pop %0" + : "+r" (flags), "+r" (tmp) : : "cc"); + int OF = (flags >> 11) & 1; + int SF = (flags >> 7) & 1; + int ZF = (flags >> 6) & 1; + int CF = flags & 1; + tmp = (value == 0); + if (ZF != tmp || SF != g_SF || OF != g_OF || CF != g_CF) + mismatch = true; + } + }}}} + if (mismatch) + write_log (_T("Target CPU defines all flags on BSF instruction\n")); + return !mismatch; +} +#endif + +#if 0 + +/************************************************************************* +* Checking for CPU features * +*************************************************************************/ + +typedef struct { + uae_u32 eax; + uae_u32 ecx; + uae_u32 edx; + uae_u32 ebx; +} x86_regs; + + +/* This could be so much easier if it could make assumptions about the +compiler... */ + +static uae_u32 cpuid_ptr; +static uae_u32 cpuid_level; + +static x86_regs cpuid(uae_u32 level) +{ + x86_regs answer; + uae_u8 *cpuid_space; + void* tmp=get_target(); + + cpuid_ptr=(uae_u32)&answer; + cpuid_level=level; + + cpuid_space = cache_alloc (256); + set_target(cpuid_space); + raw_push_l_r(0); /* eax */ + raw_push_l_r(1); /* ecx */ + raw_push_l_r(2); /* edx */ + raw_push_l_r(3); /* ebx */ + raw_push_l_r(7); /* edi */ + raw_mov_l_rm(0,(uae_u32)&cpuid_level); + raw_cpuid(0); + raw_mov_l_rm(7,(uae_u32)&cpuid_ptr); + raw_mov_l_Rr(7,0,0); + raw_mov_l_Rr(7,1,4); + raw_mov_l_Rr(7,2,8); + raw_mov_l_Rr(7,3,12); + raw_pop_l_r(7); + raw_pop_l_r(3); + raw_pop_l_r(2); + raw_pop_l_r(1); + raw_pop_l_r(0); + raw_ret(); + set_target(tmp); + + ((cpuop_func*)cpuid_space)(0); + cache_free (cpuid_space); + return answer; +} + +static void raw_init_cpu(void) +{ + x86_regs x; + uae_u32 maxlev; + + x=cpuid(0); + maxlev=x.eax; + write_log (_T("Max CPUID level=%d Processor is %c%c%c%c%c%c%c%c%c%c%c%c\n"), + maxlev, + x.ebx, + x.ebx>>8, + x.ebx>>16, + x.ebx>>24, + x.edx, + x.edx>>8, + x.edx>>16, + x.edx>>24, + x.ecx, + x.ecx>>8, + x.ecx>>16, + x.ecx>>24 + ); + have_rat_stall=(x.ecx==0x6c65746e); + + if (maxlev>=1) { + x=cpuid(1); + if (x.edx&(1<<15)) + have_cmov=1; + } + have_rat_stall=1; +#if 0 + if (!have_cmov) + have_rat_stall=0; +#endif +#if 0 + write_log (_T("have_cmov=%d, avoid_cmov=%d, have_rat_stall=%d\n"), + have_cmov,currprefs.avoid_cmov,have_rat_stall); + if (currprefs.avoid_cmov) { + write_log (_T("Disabling cmov use despite processor claiming to support it!\n")); + have_cmov=0; + } +#else + /* Dear Bernie, I don't want to keep around options which are useless, and not + represented in the GUI anymore... Is this okay? */ + write_log (_T("have_cmov=%d, have_rat_stall=%d\n"), have_cmov, have_rat_stall); +#endif +#if 0 /* For testing of non-cmov code! */ + have_cmov=0; +#endif +#if 0 /* It appears that partial register writes are a bad idea even on + AMD K7 cores, even though they are not supposed to have the + dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */ + if (have_cmov) + have_rat_stall=1; +#endif +} +#endif + +/************************************************************************* +* FPU stuff * +*************************************************************************/ + + +STATIC_INLINE void raw_fp_init(void) +{ + int i; + + for (i=0;i1) { + emit_byte(0x9b); + emit_byte(0xdb); + emit_byte(0xe3); + live.tos=-1; + } +#endif + while (live.tos>=1) { + emit_byte(0xde); + emit_byte(0xd9); + live.tos-=2; + } + while (live.tos>=0) { + emit_byte(0xdd); + emit_byte(0xd8); + live.tos--; + } + raw_fp_init(); +} + +STATIC_INLINE void make_tos(int r) +{ + int p,q; + + if (live.spos[r]<0) { /* Register not yet on stack */ + emit_byte(0xd9); + emit_byte(0xe8); /* Push '1' on the stack, just to grow it */ + live.tos++; + live.spos[r]=live.tos; + live.onstack[live.tos]=r; + return; + } + /* Register is on stack */ + if (live.tos==live.spos[r]) + return; + p=live.spos[r]; + q=live.onstack[live.tos]; + + emit_byte(0xd9); + emit_byte(0xc8+live.tos-live.spos[r]); /* exchange it with top of stack */ + live.onstack[live.tos]=r; + live.spos[r]=live.tos; + live.onstack[p]=q; + live.spos[q]=p; +} + +STATIC_INLINE int stackpos(int r) +{ + if (live.spos[r]<0) + abort(); + if (live.tos=0) { + /* source is on top of stack, and we already have the dest */ + int dd=stackpos(d); + emit_byte(0xdd); + emit_byte(0xd0+dd); + } + else { + emit_byte(0xd9); + emit_byte(0xc0+ds); /* duplicate source on tos */ + tos_make(d); /* store to destination, pop if necessary */ + } +} +LENDFUNC(NONE,NONE,2,raw_fmov_rr,(FW d, FR s)) + + LOWFUNC(NONE,READ,2,raw_fldcw_m_indexed,(R4 index, IMM base)) +{ + emit_byte(0xd9); + emit_byte(0xa8+index); + emit_long(base); +} +LENDFUNC(NONE,READ,2,raw_fldcw_m_indexed,(R4 index, IMM base)) + + LOWFUNC(NONE,NONE,2,raw_fsqrt_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xfa); /* fsqrt sqrt(x) */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xfa); /* fsqrt y=sqrt(x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_fsqrt_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fabs_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xe1); /* fabs abs(x) */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xe1); /* fabs y=abs(x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_fabs_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_frndint_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x) */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xfc); /* frndint y=int(x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_frndint_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fgetexp_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xf4); /* fxtract exp push man */ + emit_byte(0xdd); + emit_byte(0xd8); /* fstp just pop man */ + tos_make(d); /* store exp to destination */ + } + else { + make_tos(d); /* tos=x=y */ + emit_byte(0xd9); + emit_byte(0xf4); /* fxtract exp push man */ + emit_byte(0xdd); + emit_byte(0xd8); /* fstp just pop man */ + } +} +LENDFUNC(NONE,NONE,2,raw_fgetexp_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fgetman_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xf4); /* fxtract exp push man */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy man up & pop */ + tos_make(d); /* store man to destination */ + } + else { + make_tos(d); /* tos=x=y */ + emit_byte(0xd9); + emit_byte(0xf4); /* fxtract exp push man */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy man up & pop */ + } +} +LENDFUNC(NONE,NONE,2,raw_fgetman_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fsin_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xfe); /* fsin sin(x) */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xfe); /* fsin y=sin(x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_fsin_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fcos_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xff); /* fcos cos(x) */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xff); /* fcos y=cos(x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_fcos_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_ftan_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xf2); /* fptan tan(x)=y/1.0 */ + emit_byte(0xdd); + emit_byte(0xd8); /* fstp pop 1.0 */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xf2); /* fptan tan(x)=y/1.0 */ + emit_byte(0xdd); + emit_byte(0xd8); /* fstp pop 1.0 */ + } +} +LENDFUNC(NONE,NONE,2,raw_ftan_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,3,raw_fsincos_rr,(FW d, FW c, FR s)) +{ + int ds; + + if (s==d) { + //write_log (_T("FSINCOS src = dest\n")); + make_tos(s); + emit_byte(0xd9); + emit_byte(0xfb); /* fsincos sin(x) push cos(x) */ + tos_make(c); /* store cos(x) to c */ + return; + } + + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xfb); /* fsincos sin(x) push cos(x) */ + if (live.spos[c]<0) { + if (live.spos[d]<0) { /* occupy both regs directly */ + live.tos++; + live.spos[d]=live.tos; + live.onstack[live.tos]=d; /* sin(x) comes first */ + live.tos++; + live.spos[c]=live.tos; + live.onstack[live.tos]=c; + } + else { + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap cos(x) with sin(x) */ + emit_byte(0xdd); /* store sin(x) to d & pop */ + emit_byte(0xd8+(live.tos+2)-live.spos[d]); + live.tos++; /* occupy a reg for cos(x) here */ + live.spos[c]=live.tos; + live.onstack[live.tos]=c; + } + } + else { + emit_byte(0xdd); /* store cos(x) to c & pop */ + emit_byte(0xd8+(live.tos+2)-live.spos[c]); + tos_make(d); /* store sin(x) to destination */ + } +} +LENDFUNC(NONE,NONE,3,raw_fsincos_rr,(FW d, FW c, FR s)) + + float one=1; + +LOWFUNC(NONE,NONE,2,raw_fscale_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[d]==live.tos && live.spos[s]==live.tos-1) { + //write_log (_T("fscale found x in TOS-1 and y in TOS\n")); + emit_byte(0xd9); + emit_byte(0xfd); /* fscale y*(2^x) */ + } + else { + make_tos(s); /* tos=x */ + ds=stackpos(d); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld y */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale y*(2^x) */ + tos_make(d); /* store y=y*(2^x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_fscale_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_ftwotox_rr,(FW d, FR s)) +{ + int ds; + + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x) */ + emit_byte(0xd9); + emit_byte(0xc1+ds); /* fld x again */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub frac(x) = x - int(x) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + tos_make(d); /* store y=2^x */ +} +LENDFUNC(NONE,NONE,2,raw_ftwotox_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fetox_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xea); /* fldl2e log2(e) */ + emit_byte(0xd8); + emit_byte(0xc9); /* fmul x*log2(e) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy up */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap top two elements */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub x*log2(e) - int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + if (s!=d) + tos_make(d); /* store y=e^x */ +} +LENDFUNC(NONE,NONE,2,raw_fetox_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fetoxM1_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xea); /* fldl2e log2(e) */ + emit_byte(0xd8); + emit_byte(0xc9); /* fmul x*log2(e) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy up */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap top two elements */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub x*log2(e) - int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale ((2^frac(x))-1)*2^int(x*log2(e)) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + if (s!=d) + tos_make(d); /* store y=(e^x)-1 */ +} +LENDFUNC(NONE,NONE,2,raw_fetoxM1_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_ftentox_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xe9); /* fldl2t log2(10) */ + emit_byte(0xd8); + emit_byte(0xc9); /* fmul x*log2(10) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy up */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x*log2(10)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap top two elements */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub x*log2(10) - int(x*log2(10)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(10)) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + if (s!=d) + tos_make(d); /* store y=10^x */ +} +LENDFUNC(NONE,NONE,2,raw_ftentox_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_flog2_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xe8); /* fld1 1 */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap 1 with x */ + emit_byte(0xd9); + emit_byte(0xf1); /* fyl2x 1*log2(x) */ + if (s!=d) + tos_make(d); /* store y=log2(x) */ +} +LENDFUNC(NONE,NONE,2,raw_flog2_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_flogN_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xed); /* fldln2 logN(2) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap logN(2) with x */ + emit_byte(0xd9); + emit_byte(0xf1); /* fyl2x logN(2)*log2(x) */ + if (s!=d) + tos_make(d); /* store y=logN(x) */ +} +LENDFUNC(NONE,NONE,2,raw_flogN_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_flogNP1_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xed); /* fldln2 logN(2) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap logN(2) with x */ + emit_byte(0xd9); + emit_byte(0xf9); /* fyl2xp1 logN(2)*log2(x+1) */ + if (s!=d) + tos_make(d); /* store y=logN(x+1) */ +} +LENDFUNC(NONE,NONE,2,raw_flogNP1_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_flog10_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xec); /* fldlg2 log10(2) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap log10(2) with x */ + emit_byte(0xd9); + emit_byte(0xf1); /* fyl2x log10(2)*log2(x) */ + if (s!=d) + tos_make(d); /* store y=log10(x) */ +} +LENDFUNC(NONE,NONE,2,raw_flog10_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fasin_rr,(FW d, FR s)) +{ + int ds; + + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd8); + emit_byte(0xc8); /* fmul x*x */ + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xde); + emit_byte(0xe1); /* fsubrp 1 - (x^2) */ + emit_byte(0xd9); + emit_byte(0xfa); /* fsqrt sqrt(1-(x^2)) */ + emit_byte(0xd9); + emit_byte(0xc1+ds); /* fld x again */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap x with sqrt(1-(x^2)) */ + emit_byte(0xd9); + emit_byte(0xf3); /* fpatan atan(x/sqrt(1-(x^2))) & pop */ + tos_make(d); /* store y=asin(x) */ +} +LENDFUNC(NONE,NONE,2,raw_fasin_rr,(FW d, FR s)) + + static uae_u32 pihalf[] = {0x2168c234, 0xc90fdaa2, 0x3fff}; // LSB=0 to get acos(1)=0 +LOWFUNC(NONE,NONE,2,raw_facos_rr,(FW d, FR s)) +{ + int ds; + + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd8); + emit_byte(0xc8); /* fmul x*x */ + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xde); + emit_byte(0xe1); /* fsubrp 1 - (x^2) */ + emit_byte(0xd9); + emit_byte(0xfa); /* fsqrt sqrt(1-(x^2)) */ + emit_byte(0xd9); + emit_byte(0xc1+ds); /* fld x again */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap x with sqrt(1-(x^2)) */ + emit_byte(0xd9); + emit_byte(0xf3); /* fpatan atan(x/sqrt(1-(x^2))) & pop */ + emit_byte(0xdb); + emit_byte(0x2d); + emit_long((uae_u32)&pihalf); /* fld load pi/2 from pihalf */ + emit_byte(0xde); + emit_byte(0xe1); /* fsubrp pi/2 - asin(x) & pop */ + tos_make(d); /* store y=acos(x) */ +} +LENDFUNC(NONE,NONE,2,raw_facos_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fatan_rr,(FW d, FR s)) +{ + int ds; + + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xd9); + emit_byte(0xf3); /* fpatan atan(x)/1 & pop*/ + if (s!=d) + tos_make(d); /* store y=atan(x) */ +} +LENDFUNC(NONE,NONE,2,raw_fatan_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fatanh_rr,(FW d, FR s)) +{ + int ds; + + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xdc); + emit_byte(0xc1); /* fadd 1 + x */ + emit_byte(0xd8); + emit_byte(0xe2+ds); /* fsub 1 - x */ + emit_byte(0xde); + emit_byte(0xf9); /* fdivp (1+x)/(1-x) */ + emit_byte(0xd9); + emit_byte(0xed); /* fldl2e logN(2) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap logN(2) with (1+x)/(1-x) */ + emit_byte(0xd9); + emit_byte(0xf1); /* fyl2x logN(2)*log2((1+x)/(1-x)) pop */ + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xd9); + emit_byte(0xe0); /* fchs -1.0 */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale logN((1+x)/(1-x)) * 2^(-1) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + tos_make(d); /* store y=atanh(x) */ +} +LENDFUNC(NONE,NONE,2,raw_fatanh_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fsinh_rr,(FW d, FR s)) +{ + int ds,tr; + + tr=live.onstack[live.tos+3]; + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xea); /* fldl2e log2(e) */ + emit_byte(0xd8); + emit_byte(0xc9); /* fmul x*log2(e) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy x*log2(e) */ + if (tr>=0) { + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap with temp-reg */ + emit_byte(0x83); + emit_byte(0xc4); + emit_byte(0xf4); /* add -12 to esp */ + emit_byte(0xdb); + emit_byte(0x3c); + emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */ + } + emit_byte(0xd9); + emit_byte(0xe0); /* fchs -x*log2(e) */ + emit_byte(0xd9); + emit_byte(0xc0); /* fld -x*log2(e) again */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(-x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub -x*log2(e) - int(-x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap e^-x with x*log2(e) in tr */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy x*log2(e) */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub x*log2(e) - int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy e^x & pop */ + if (tr>=0) { + emit_byte(0xdb); + emit_byte(0x2c); + emit_byte(0x24); /* fld load temp-reg from [esp] */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */ + emit_byte(0xde); + emit_byte(0xe9); /* fsubp (e^x)-(e^-x) */ + emit_byte(0x83); + emit_byte(0xc4); + emit_byte(0x0c); /* delayed add +12 to esp */ + } + else { + emit_byte(0xde); + emit_byte(0xe1); /* fsubrp (e^x)-(e^-x) */ + } + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xd9); + emit_byte(0xe0); /* fchs -1.0 */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale ((e^x)-(e^-x))/2 */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + if (s!=d) + tos_make(d); /* store y=sinh(x) */ +} +LENDFUNC(NONE,NONE,2,raw_fsinh_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fcosh_rr,(FW d, FR s)) +{ + int ds,tr; + + tr=live.onstack[live.tos+3]; + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xea); /* fldl2e log2(e) */ + emit_byte(0xd8); + emit_byte(0xc9); /* fmul x*log2(e) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy x*log2(e) */ + if (tr>=0) { + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap with temp-reg */ + emit_byte(0x83); + emit_byte(0xc4); + emit_byte(0xf4); /* add -12 to esp */ + emit_byte(0xdb); + emit_byte(0x3c); + emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */ + } + emit_byte(0xd9); + emit_byte(0xe0); /* fchs -x*log2(e) */ + emit_byte(0xd9); + emit_byte(0xc0); /* fld -x*log2(e) again */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(-x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub -x*log2(e) - int(-x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap e^-x with x*log2(e) in tr */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy x*log2(e) */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub x*log2(e) - int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy e^x & pop */ + if (tr>=0) { + emit_byte(0xdb); + emit_byte(0x2c); + emit_byte(0x24); /* fld load temp-reg from [esp] */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */ + emit_byte(0x83); + emit_byte(0xc4); + emit_byte(0x0c); /* delayed add +12 to esp */ + } + emit_byte(0xde); + emit_byte(0xc1); /* faddp (e^x)+(e^-x) */ + emit_byte(0xd9); + emit_byte(0xe8); /* fld 1.0 */ + emit_byte(0xd9); + emit_byte(0xe0); /* fchs -1.0 */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale ((e^x)+(e^-x))/2 */ + emit_byte(0xdd); + emit_byte(0xd9); /* fstp copy & pop */ + if (s!=d) + tos_make(d); /* store y=cosh(x) */ +} +LENDFUNC(NONE,NONE,2,raw_fcosh_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_ftanh_rr,(FW d, FR s)) +{ + int ds,tr; + + tr=live.onstack[live.tos+3]; + if (s==d) + make_tos(s); + else { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld x */ + } + emit_byte(0xd9); + emit_byte(0xea); /* fldl2e log2(e) */ + emit_byte(0xd8); + emit_byte(0xc9); /* fmul x*log2(e) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy x*log2(e) */ + if (tr>=0) { + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap with temp-reg */ + emit_byte(0x83); + emit_byte(0xc4); + emit_byte(0xf4); /* add -12 to esp */ + emit_byte(0xdb); + emit_byte(0x3c); + emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */ + } + emit_byte(0xd9); + emit_byte(0xe0); /* fchs -x*log2(e) */ + emit_byte(0xd9); + emit_byte(0xc0); /* fld -x*log2(e) again */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(-x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub -x*log2(e) - int(-x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap e^-x with x*log2(e) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy x*log2(e) */ + emit_byte(0xd9); + emit_byte(0xfc); /* frndint int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xc9); /* fxch swap */ + emit_byte(0xd8); + emit_byte(0xe1); /* fsub x*log2(e) - int(x*log2(e)) */ + emit_byte(0xd9); + emit_byte(0xf0); /* f2xm1 (2^frac(x))-1 */ + emit_byte(0xd8); + emit_byte(0x05); + emit_long((uae_u32)&one); /* fadd (2^frac(x))-1 + 1 */ + emit_byte(0xd9); + emit_byte(0xfd); /* fscale (2^frac(x))*2^int(x*log2(e)) */ + emit_byte(0xdd); + emit_byte(0xd1); /* fst copy e^x */ + emit_byte(0xd8); + emit_byte(0xc2); /* fadd (e^x)+(e^-x) */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap with e^-x */ + emit_byte(0xde); + emit_byte(0xe9); /* fsubp (e^x)-(e^-x) */ + if (tr>=0) { + emit_byte(0xdb); + emit_byte(0x2c); + emit_byte(0x24); /* fld load temp-reg from [esp] */ + emit_byte(0xd9); + emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */ + emit_byte(0xde); + emit_byte(0xf9); /* fdivp ((e^x)-(e^-x))/((e^x)+(e^-x)) */ + emit_byte(0x83); + emit_byte(0xc4); + emit_byte(0x0c); /* delayed add +12 to esp */ + } + else { + emit_byte(0xde); + emit_byte(0xf1); /* fdivrp ((e^x)-(e^-x))/((e^x)+(e^-x)) */ + } + if (s!=d) + tos_make(d); /* store y=tanh(x) */ +} +LENDFUNC(NONE,NONE,2,raw_ftanh_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fneg_rr,(FW d, FR s)) +{ + int ds; + + if (d!=s) { + ds=stackpos(s); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* duplicate source */ + emit_byte(0xd9); + emit_byte(0xe0); /* take fchs */ + tos_make(d); /* store to destination */ + } + else { + make_tos(d); + emit_byte(0xd9); + emit_byte(0xe0); /* take fchs */ + } +} +LENDFUNC(NONE,NONE,2,raw_fneg_rr,(FW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fadd_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[s]==live.tos) { + /* Source is on top of stack */ + ds=stackpos(d); + emit_byte(0xdc); + emit_byte(0xc0+ds); /* add source to dest*/ + } + else { + make_tos(d); + ds=stackpos(s); + + emit_byte(0xd8); + emit_byte(0xc0+ds); /* add source to dest*/ + } +} +LENDFUNC(NONE,NONE,2,raw_fadd_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fsub_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[s]==live.tos) { + /* Source is on top of stack */ + ds=stackpos(d); + emit_byte(0xdc); + emit_byte(0xe8+ds); /* sub source from dest*/ + } + else { + make_tos(d); + ds=stackpos(s); + + emit_byte(0xd8); + emit_byte(0xe0+ds); /* sub src from dest */ + } +} +LENDFUNC(NONE,NONE,2,raw_fsub_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fcmp_rr,(FR d, FR s)) +{ + int ds; + + make_tos(d); + ds=stackpos(s); + + emit_byte(0xdd); + emit_byte(0xe0+ds); /* cmp dest with source*/ +} +LENDFUNC(NONE,NONE,2,raw_fcmp_rr,(FR d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fmul_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[s]==live.tos) { + /* Source is on top of stack */ + ds=stackpos(d); + emit_byte(0xdc); + emit_byte(0xc8+ds); /* mul dest by source*/ + } + else { + make_tos(d); + ds=stackpos(s); + + emit_byte(0xd8); + emit_byte(0xc8+ds); /* mul dest by source*/ + } +} +LENDFUNC(NONE,NONE,2,raw_fmul_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_fdiv_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[s]==live.tos) { + /* Source is on top of stack */ + ds=stackpos(d); + emit_byte(0xdc); + emit_byte(0xf8+ds); /* div dest by source */ + } + else { + make_tos(d); + ds=stackpos(s); + + emit_byte(0xd8); + emit_byte(0xf0+ds); /* div dest by source*/ + } +} +LENDFUNC(NONE,NONE,2,raw_fdiv_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_frem_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[d]==live.tos && live.spos[s]==live.tos-1) { + //write_log (_T("frem found x in TOS-1 and y in TOS\n")); + emit_byte(0xd9); + emit_byte(0xf8); /* fprem rem(y/x) */ + } + else { + make_tos(s); /* tos=x */ + ds=stackpos(d); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld y */ + emit_byte(0xd9); + emit_byte(0xf8); /* fprem rem(y/x) */ + tos_make(d); /* store y=rem(y/x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_frem_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,2,raw_frem1_rr,(FRW d, FR s)) +{ + int ds; + + if (live.spos[d]==live.tos && live.spos[s]==live.tos-1) { + //write_log (_T("frem1 found x in TOS-1 and y in TOS\n")); + emit_byte(0xd9); + emit_byte(0xf5); /* fprem1 rem1(y/x) */ + } + else { + make_tos(s); /* tos=x */ + ds=stackpos(d); + emit_byte(0xd9); + emit_byte(0xc0+ds); /* fld y */ + emit_byte(0xd9); + emit_byte(0xf5); /* fprem1 rem1(y/x) */ + tos_make(d); /* store y=rem(y/x) */ + } +} +LENDFUNC(NONE,NONE,2,raw_frem1_rr,(FRW d, FR s)) + + LOWFUNC(NONE,NONE,1,raw_ftst_r,(FR r)) +{ + make_tos(r); + emit_byte(0xd9); /* ftst */ + emit_byte(0xe4); +} +LENDFUNC(NONE,NONE,1,raw_ftst_r,(FR r)) + + STATIC_INLINE void raw_fflags_into_flags(int r) +{ + int p; + + usereg(r); + p=stackpos(r); + + emit_byte(0xd9); + emit_byte(0xee); /* Push 0 */ + emit_byte(0xd9); + emit_byte(0xc9+p); /* swap top two around */ + if (have_cmov) { + // gb-- fucomi is for P6 cores only, not K6-2 then... + emit_byte(0xdb); + emit_byte(0xe9+p); /* fucomi them */ + } + else { + emit_byte(0xdd); + emit_byte(0xe1+p); /* fucom them */ + emit_byte(0x9b); + emit_byte(0xdf); + emit_byte(0xe0); /* fstsw ax */ + raw_sahf(0); /* sahf */ + } + emit_byte(0xdd); + emit_byte(0xd9+p); /* store value back, and get rid of 0 */ +} diff --git a/jit/compemu_support.cpp b/jit/compemu_support.cpp index ace8ae6b..3f82bdc0 100644 --- a/jit/compemu_support.cpp +++ b/jit/compemu_support.cpp @@ -594,7 +594,7 @@ STATIC_INLINE uae_u8* get_target(void) * Getting the information about the target CPU * ********************************************************************/ -#include "compemu_raw_x86.cpp" +#include "codegen_x86.cpp" /******************************************************************** -- 2.47.3