From: Dimitris Panokostas Date: Tue, 19 May 2026 17:59:55 +0000 (+0200) Subject: Fix x64 FPU control word setup X-Git-Url: https://git.unchartedbackwaters.co.uk/w/?a=commitdiff_plain;h=c8ea70739e5cfa75ff37cdc0a5f5ef9f022c775a;p=francis%2Fwinuae.git Fix x64 FPU control word setup --- diff --git a/fpp_native.cpp b/fpp_native.cpp index 2c922965..83b69e0c 100644 --- a/fpp_native.cpp +++ b/fpp_native.cpp @@ -74,44 +74,54 @@ static int temp_prec; #if defined(CPU_i386) || defined(CPU_x86_64) -/* The main motivation for dynamically creating an x86(-64) function in - * memory is because MSVC (x64) does not allow you to use inline assembly, - * and the x86-64 versions of _control87/_controlfp functions only modifies - * SSE2 registers. */ - static uae_u16 x87_cw = 0; + +#if defined(_MSC_VER) && defined(WIN64) +extern "C" void _cdecl xfp_fldcw(uae_u16*); +#else static uae_u8 *x87_fldcw_code = NULL; typedef void (uae_cdecl *x87_fldcw_function)(void); +#endif void init_fpucw_x87(void) { +#if defined(_MSC_VER) && defined(WIN64) + /* xfp_fldcw() comes from fpux64_80.asm. It avoids a generated + * executable thunk that would otherwise need to encode x64 addresses. */ +#else if (x87_fldcw_code) { return; } x87_fldcw_code = (uae_u8 *) uae_vm_alloc( - uae_vm_page_size(), UAE_VM_32BIT, UAE_VM_READ_WRITE_EXECUTE); + uae_vm_page_size(), 0, UAE_VM_READ_WRITE_EXECUTE); uae_u8 *c = x87_fldcw_code; - /* mov eax,0x0 */ - *(c++) = 0xb8; - *(c++) = 0x00; - *(c++) = 0x00; - *(c++) = 0x00; - *(c++) = 0x00; #ifdef CPU_x86_64 - /* Address override prefix */ - *(c++) = 0x67; -#endif - /* fldcw WORD PTR [eax+addr] */ + uintptr_t addr = (uintptr_t)&x87_cw; + /* mov rax,addr */ + *(c++) = 0x48; + *(c++) = 0xb8; + for (int i = 0; i < 8; i++) { + *(c++) = (addr >> (i * 8)) & 0xff; + } + /* fldcw WORD PTR [rax] */ *(c++) = 0xd9; - *(c++) = 0xa8; + *(c++) = 0x28; +#else + /* mov eax,addr */ + *(c++) = 0xb8; *(c++) = (((uintptr_t) &x87_cw) ) & 0xff; *(c++) = (((uintptr_t) &x87_cw) >> 8) & 0xff; *(c++) = (((uintptr_t) &x87_cw) >> 16) & 0xff; *(c++) = (((uintptr_t) &x87_cw) >> 24) & 0xff; + /* fldcw WORD PTR [eax] */ + *(c++) = 0xd9; + *(c++) = 0x28; +#endif /* ret */ *(c++) = 0xc3; /* Write-protect the function */ uae_vm_protect(x87_fldcw_code, uae_vm_page_size(), UAE_VM_READ_EXECUTE); +#endif } static void set_fpucw_x87(uae_u32 m68k_cw) @@ -124,11 +134,11 @@ static void set_fpucw_x87(uae_u32 m68k_cw) static const unsigned int fp87_prec[4] = { _PC_53, _PC_24, _PC_53, 0 }; int round = (m68k_cw >> 4) & 3; #ifdef WIN64 - // x64 only sets SSE2, must also call x87_fldcw_code() to set FPU rounding mode. + // x64 only sets SSE2, must also load the x87 control word to set FPU rounding mode. _controlfp(ex | fp87_round[round], _MCW_RC); #else int prec = (m68k_cw >> 6) & 3; - // x86 sets both FPU and SSE2 rounding mode, don't need x87_fldcw_code() + // x86 sets both FPU and SSE2 rounding mode, don't need an explicit x87 load. _control87(ex | fp87_round[round] | fp87_prec[prec], _MCW_RC | _MCW_PC); return; #endif @@ -144,7 +154,9 @@ static void set_fpucw_x87(uae_u32 m68k_cw) 0x127f, 0x1e7f, 0x167f, 0x1a7f, /* undefined (Double) */ }; x87_cw = x87_cw_tab[(m68k_cw >> 4) & 0xf]; -#if defined(X86_MSVC_ASSEMBLY) && 0 +#if defined(_MSC_VER) && defined(WIN64) + xfp_fldcw(&x87_cw); +#elif defined(X86_MSVC_ASSEMBLY) && 0 __asm { fldcw word ptr x87_cw } #elif defined(__GNUC__) && 0 __asm__("fldcw %0" : : "m" (*&x87_cw)); diff --git a/od-win32/fpp_native_msvc_80bit.cpp b/od-win32/fpp_native_msvc_80bit.cpp index d5457867..f3e194d9 100644 --- a/od-win32/fpp_native_msvc_80bit.cpp +++ b/od-win32/fpp_native_msvc_80bit.cpp @@ -21,7 +21,6 @@ #include "newcpu.h" #include "fpp.h" #include "uae/attributes.h" -#include "uae/vm.h" #include "softfloat/softfloat-specialize.h" extern "C" @@ -117,43 +116,10 @@ static void fp_set_mode(uae_u32 m68k_cw) } -/* The main motivation for dynamically creating an x86(-64) function in -* memory is because MSVC (x64) does not allow you to use inline assembly, -* and the x86-64 versions of _control87/_controlfp functions only modifies -* SSE2 registers. */ - static uae_u16 x87_cw = 0; -static uae_u8 *x87_fldcw_code = NULL; -typedef void (uae_cdecl *x87_fldcw_function)(void); void init_fpucw_x87_80(void) { - if (x87_fldcw_code) { - return; - } - x87_fldcw_code = (uae_u8 *)uae_vm_alloc(uae_vm_page_size(), UAE_VM_32BIT, UAE_VM_READ_WRITE_EXECUTE); - uae_u8 *c = x87_fldcw_code; - /* mov eax,0x0 */ - *(c++) = 0xb8; - *(c++) = 0x00; - *(c++) = 0x00; - *(c++) = 0x00; - *(c++) = 0x00; -#ifdef CPU_x86_64 - /* Address override prefix */ - *(c++) = 0x67; -#endif - /* fldcw WORD PTR [eax+addr] */ - *(c++) = 0xd9; - *(c++) = 0xa8; - *(c++) = (((uintptr_t)&x87_cw)) & 0xff; - *(c++) = (((uintptr_t)&x87_cw) >> 8) & 0xff; - *(c++) = (((uintptr_t)&x87_cw) >> 16) & 0xff; - *(c++) = (((uintptr_t)&x87_cw) >> 24) & 0xff; - /* ret */ - *(c++) = 0xc3; - /* Write-protect the function */ - uae_vm_protect(x87_fldcw_code, uae_vm_page_size(), UAE_VM_READ_EXECUTE); } static void native_set_fpucw(uae_u32 m68k_cw) @@ -165,11 +131,11 @@ static void native_set_fpucw(uae_u32 m68k_cw) static const unsigned int fp87_prec[4] = { _PC_53, _PC_24, _PC_53, _PC_53 }; int round = (m68k_cw >> 4) & 3; #ifdef WIN64 - // x64 only sets SSE2, must also call x87_fldcw_code() to set FPU rounding mode. + // x64 only sets SSE2, must also load the x87 control word to set FPU rounding mode. _controlfp(ex | fp87_round[round], _MCW_RC); #else int prec = (m68k_cw >> 6) & 3; - // x86 sets both FPU and SSE2 rounding mode, don't need x87_fldcw_code() + // x86 sets both FPU and SSE2 rounding mode, don't need an explicit x87 load. _control87(ex | fp87_round[round] | fp87_prec[prec], _MCW_RC | _MCW_PC); return; #endif @@ -180,7 +146,7 @@ static void native_set_fpucw(uae_u32 m68k_cw) 0x127f, 0x1e7f, 0x167f, 0x1a7f, /* undefined (Double) */ }; x87_cw = x87_cw_tab[(m68k_cw >> 4) & 0xf]; - ((x87_fldcw_function)x87_fldcw_code)(); + xfp_fldcw(&x87_cw); } /* Functions for setting host/library modes and getting status */ diff --git a/od-win32/fpux64_80.asm b/od-win32/fpux64_80.asm index 81e63eef..92b03266 100644 --- a/od-win32/fpux64_80.asm +++ b/od-win32/fpux64_80.asm @@ -1,6 +1,10 @@ ; 64-bit assembly functions for native 80-bit FPU emulation +cpu x64 +bits 64 +default rel + global xfp_int global xfp_mov global xfp_fldcw @@ -53,8 +57,6 @@ global xfp_clear_status section .text -bits 64 - %macro loadfp1 0 fld tword[rdx] %endmacro diff --git a/od-win32/winuae_msvc15/winuae_msvc.vcxproj b/od-win32/winuae_msvc15/winuae_msvc.vcxproj index bfffae12..77a074d8 100644 --- a/od-win32/winuae_msvc15/winuae_msvc.vcxproj +++ b/od-win32/winuae_msvc15/winuae_msvc.vcxproj @@ -560,12 +560,11 @@ true - false - true + true + false true MachineX64 true - 0x10000000 /ignore:4099 %(AdditionalOptions) @@ -792,11 +791,10 @@ true - false - true + true + false true MachineX64 - 0x10000000 /ignore:4099 %(AdditionalOptions) @@ -1104,11 +1102,10 @@ true true UseLinkTimeCodeGeneration - false - true + true + false true MachineX64 - 0x10000000 /ignore:4099 %(AdditionalOptions) true @@ -2470,4 +2467,4 @@ - \ No newline at end of file +