From 2f8db1521dc9bd64998cb786088c4d01051beb85 Mon Sep 17 00:00:00 2001
From: Frode Solheim <frode@fs-uae.net>
Date: Sun, 11 Oct 2015 21:31:00 +0200
Subject: [PATCH] New (WinUAE64 compatible) implementation of native_set_fpucw

---
 fpp.cpp                  | 172 +++++++++++++++++++++++----------------
 include/uae/attributes.h |   8 ++
 2 files changed, 112 insertions(+), 68 deletions(-)

diff --git a/fpp.cpp b/fpp.cpp
index a9fc38c9..02df5deb 100644
--- a/fpp.cpp
+++ b/fpp.cpp
@@ -23,6 +23,8 @@
 
 #include "options.h"
 #include "memory.h"
+#include "uae/attributes.h"
+#include "uae/vm.h"
 #include "custom.h"
 #include "events.h"
 #include "newcpu.h"
@@ -37,11 +39,6 @@
 #include "softfloatx80.h"
 #endif
 
-#ifdef X86_MSVC_ASSEMBLY
-#define X86_MSVC_ASSEMBLY_FPU
-#define NATIVE_FPUCW
-#endif
-
 #define DEBUG_FPP 0
 #define EXCEPTION_FPP 1
 
@@ -505,80 +502,115 @@ bool fpu_get_constant(fpdata *fp, int cr)
 	return fpu_get_constant_fp(fp, cr);
 }
 
-static void native_set_fpucw (uae_u32 m68k_cw)
-{
 #ifdef WITH_SOFTFLOAT
-	if (currprefs.fpu_softfloat) {
-		switch((m68k_cw >> 6) & 3)
-		{
-			case 0: // X
-			default: // undefined
-				fxstatus.float_rounding_precision = 80;
-			break;
-			case 1: // S
-				fxstatus.float_rounding_precision = 32;
-			break;
-			case 2: // D
-				fxstatus.float_rounding_precision = 64;
-			break;
-		}
-		switch((m68k_cw >> 4) & 3)
-		{
-			case 0: // to neareset
-				fxstatus.float_rounding_precision = float_round_nearest_even;
-			break;
-			case 1: // to zero
-				fxstatus.float_rounding_mode = float_round_to_zero;
-			break;
-			case 2: // to minus
-				fxstatus.float_rounding_mode = float_round_down;
-			break;
-			case 3: // to plus
-				fxstatus.float_rounding_mode = float_round_up;
-			break;
-		}
-	} else
-#endif
-	{
-#ifdef NATIVE_FPUCW
-#ifdef _WIN32
-		static int ex = 0;
-		// RN, RZ, RM, RP
-		static const unsigned int fp87_round[4] = { _RC_NEAR, _RC_CHOP, _RC_DOWN, _RC_UP };
-		// Extend X, Single S, Double D, Undefined
-		static const unsigned int fp87_prec[4] = { _PC_64 , _PC_24 , _PC_53, 0 };
-#ifdef WIN64
-		_controlfp (ex | fp87_round[(m68k_cw >> 4) & 3], _MCW_RC);
-#else
-		_control87 (ex | fp87_round[(m68k_cw >> 4) & 3] | fp87_prec[(m68k_cw >> 6) & 3], _MCW_RC | _MCW_PC);
-#endif
-#else
+
+static inline void set_fpucw_softfloat(uae_u32 m68k_cw)
+{
+	switch((m68k_cw >> 6) & 3) {
+	case 0: // X
+	default: // undefined
+		fxstatus.float_rounding_precision = 80;
+		break;
+	case 1: // S
+		fxstatus.float_rounding_precision = 32;
+		break;
+	case 2: // D
+		fxstatus.float_rounding_precision = 64;
+		break;
+	}
+	switch((m68k_cw >> 4) & 3) {
+	case 0: // to neareset
+		fxstatus.float_rounding_precision = float_round_nearest_even;
+		break;
+	case 1: // to zero
+		fxstatus.float_rounding_mode = float_round_to_zero;
+		break;
+	case 2: // to minus
+		fxstatus.float_rounding_mode = float_round_down;
+		break;
+	case 3: // to plus
+		fxstatus.float_rounding_mode = float_round_up;
+		break;
+	}
+	return;
+}
+
+#endif /* WITH_SOFTFLOAT */
+
+#if defined(CPU_i386) || defined(CPU_x86_64)
+
+/* The main motivation for dynamically creating an x86(-64) function in
+ * memory is because MSVC (x64) does not allow you to use inline assembly,
+ * and the x86-64 versions of _control87/_controlfp functions only modifies
+ * SSE2 registers. */
+
+static uae_u16 x87_cw = 0;
+static char *x87_fldcw_code = NULL;
+typedef void (uae_cdecl *x87_fldcw_function)(void);
+
+static void init_fpucw_x87(void)
+{
+	if (x87_fldcw_code) {
+		return;
+	}
+	x87_fldcw_code = (char *) uae_vm_alloc(
+		uae_vm_page_size(), UAE_VM_32BIT, UAE_VM_READ_WRITE_EXECUTE);
+	char *c = x87_fldcw_code;
+	/* mov eax,0x0 */
+	*(c++) = 0xb8;
+	*(c++) = 0x00;
+	*(c++) = 0x00;
+	*(c++) = 0x00;
+	*(c++) = 0x00;
+#ifdef CPU_x86_64
+	/* Address override prefix */
+	*(c++) = 0x67;
+#endif
+	/* fldcw WORD PTR [eax+addr] */
+	*(c++) = 0xd9;
+	*(c++) = 0xa8;
+	*(c++) = (((uintptr_t) &x87_cw)      ) & 0xff;
+	*(c++) = (((uintptr_t) &x87_cw) >>  8) & 0xff;
+	*(c++) = (((uintptr_t) &x87_cw) >> 16) & 0xff;
+	*(c++) = (((uintptr_t) &x87_cw) >> 24) & 0xff;
+	/* ret */
+	*(c++) = 0xc3;
+	/* Write-protect the function */
+	uae_vm_protect(x87_fldcw_code, uae_vm_page_size(), UAE_VM_READ_EXECUTE);
+}
+
+static inline void set_fpucw_x87(uae_u32 m68k_cw)
+{
 	static const uae_u16 x87_cw_tab[] = {
 		0x137f, 0x1f7f, 0x177f, 0x1b7f,	/* Extended */
 		0x107f, 0x1c7f, 0x147f, 0x187f,	/* Single */
 		0x127f, 0x1e7f, 0x167f, 0x1a7f,	/* Double */
 		0x137f, 0x1f7f, 0x177f, 0x1b7f	/* undefined */
 	};
-#if USE_X86_FPUCW
-		uae_u16 x87_cw = x87_cw_tab[(m68k_cw >> 4) & 0xf];
-
-#if defined(X86_MSVC_ASSEMBLY)
-		__asm {
-			fldcw word ptr x87_cw
-		}
-#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-		__asm__ ("fldcw %0" : : "m" (*&x87_cw));
+	x87_cw = x87_cw_tab[(m68k_cw >> 4) & 0xf];
+#if defined(X86_MSVC_ASSEMBLY) && 0
+	__asm { fldcw word ptr x87_cw }
+#elif defined(__GNUC__) && 0
+	__asm__("fldcw %0" : : "m" (*&x87_cw));
 #else
-		#warning floating point control not specified
-#endif
-#endif /* USE_X86_FPUCW */
+	((x87_fldcw_function) x87_fldcw_code)();
 #endif
-#else
-#ifndef _MSC_VER
-#warning NATIVE_FPUCW not enabled
+}
+
+#endif /* defined(CPU_i386) || defined(CPU_x86_64) */
+
+static void native_set_fpucw(uae_u32 m68k_cw)
+{
+#ifdef WITH_SOFTFLOAT
+	if (currprefs.fpu_softfloat) {
+		set_fpucw_softfloat(m68k_cw);
+		/* FIXME: consider removing return to *also* set x87 fpucw? */
+		return;
+	}
 #endif
+#if defined(CPU_i386) || defined(CPU_x86_64)
+	set_fpucw_x87(m68k_cw);
 #endif
-	}
 }
 
 typedef uae_s64 tointtype;
@@ -3102,6 +3134,10 @@ void fpuop_arithmetic (uae_u32 opcode, uae_u16 extra)
 
 void fpu_reset (void)
 {
+#if defined(CPU_i386) || defined(CPU_x86_64)
+	init_fpucw_x87();
+#endif
+
 	regs.fpcr = regs.fpsr = regs.fpiar = 0;
 	regs.fpu_exp_state = 0;
 	fpset (&regs.fp_result, 1);
diff --git a/include/uae/attributes.h b/include/uae/attributes.h
index 1047c845..584aa22e 100644
--- a/include/uae/attributes.h
+++ b/include/uae/attributes.h
@@ -12,6 +12,14 @@
 /* This file is intended to be included by external libraries as well,
  * so don't pull in too much UAE-specific stuff. */
 
+#ifdef _WIN32
+#define uae_cdecl __cdecl
+#elif defined(__GNUC__) && defined(__i386__)
+#define uae_cdecl __attribute__((cdecl))
+#else
+#define uae_cdecl
+#endif
+
 /* This attribute allows (some) compiles to emit warnings when incorrect
  * arguments are used with the format string. */
 
-- 
2.47.3