git.unchartedbackwaters.co.uk Git - francis/winuae.git/commitdiff
Another JIT shift instruction update.
author Toni Wilen <twilen@winuae.net>
Sat, 2 Jan 2021 14:51:30 +0000 (16:51 +0200)
committer Toni Wilen <twilen@winuae.net>
Sat, 2 Jan 2021 14:51:30 +0000 (16:51 +0200)
jit/compemu_midfunc_x86.cpp
jit/compemu_midfunc_x86.h
jit/gencomp.cpp

index 413037309f442252f410bda4981c75153079b2da..25b0ad3e4ab334f7e3450143755bc215b01ff205 100644 (file)
@@ -134,11 +134,13 @@ MIDFUNC(0,clear_overflow,(void))
        raw_popfl();
 }
 
-MIDFUNC(3,setcc_for_cntzero,(RR4 /* cnt */, RR4 data, int size, int ov))
+// This is complex because x86 shift behavior differs from 680x0 (each item below reads: x86 | 680x0).
+// - shift count 0: does not modify any flags | clears C, modifies Z and N. Does not modify X.
+// - shift count greater than or equal to data size: C undefined | C always equals last bit shifted out.
+// - shift count mask: masked by 31 (except if 64bit data size) | masked by 63.
+MIDFUNC(6, setcc_for_cntzero, (RR4 /* cnt */, RR4 data, RR4 odata, int obit, int size, int ov))
 {
-       uae_u8 *branchadd1a, *branchadd1b;
-       uae_u8* branchadd2;
-       uae_u8* branchadd3;
+       uae_u8 *branchadd1, *branchadd2, *branchadd3, *branchadd4;
 
        evict(FLAGX);
        make_flags_live_internal();
@@ -157,63 +159,61 @@ MIDFUNC(3,setcc_for_cntzero,(RR4 /* cnt */, RR4 data, int size, int ov))
                emit_byte(0xff);
        }
 
-       /*
-        * shift count can only be in CL register; see shrl_b_rr
-        */
+       // Shift count can only be in CL register; see shrl_b_rr
+       // Zero shift count?
        raw_test_b_rr(X86_CL, X86_CL);
-       /* if zero, leave X unaffected; carry flag will already be cleared */
        raw_jz_b_oponly();
-       branchadd1a = get_target();
+       branchadd4 = get_target();
        skip_byte();
 
-       /* if >= 32, recalculate all flags */
-       raw_cmp_b_ri(X86_CL, 31);
-       raw_jcc_b_oponly(NATIVE_CC_HI);
-       branchadd1b = get_target();
-       skip_byte();
-
-       /* shift count was non-zero; update also x-flag */
-       raw_popfl();
-       COMPCALL(setcc_m)((uintptr)live.state[FLAGX].mem, NATIVE_CC_CS);
-       log_vwrite(FLAGX);
-       raw_jmp_b_oponly();
+       // Shift count lower than data size?
+       raw_cmp_b_ri(X86_CL, size == 0 ? 7 : (size == 1 ? 15 : 31));
+       raw_jcc_b_oponly(NATIVE_CC_LS);
        branchadd2 = get_target();
        skip_byte();
 
-       *branchadd1a = (uintptr)get_target() - ((uintptr)branchadd1a + 1);
-
-       /* shift count was zero; need to set Z & N flags since the native flags were unaffected */
+       *branchadd4 = (uintptr)get_target() - ((uintptr)branchadd4 + 1);
+       // Shift count: zero, same or larger than data size
+       // Need to update C, N and Z.
        raw_popfl();
-       data = readreg(data, size);
+       data = readreg(data, 4);
+       /* Update Z and N (Clears also C). */
        switch (size)
        {
-               case 1: raw_test_b_rr(data, data); break;
-               case 2: raw_test_w_rr(data, data); break;
-               case 4: raw_test_l_rr(data, data); break;
+       case 0: raw_test_b_rr(data, data); break;
+       case 1: raw_test_w_rr(data, data); break;
+       case 2: raw_test_l_rr(data, data); break;
        }
+       unlock2(data);
+       // Update C (BT does not modify other flags).
+       odata = readreg(odata, 4);
+       raw_bt_l_ri(odata, obit);
+       unlock2(odata);
+       raw_pushfl();
 
+       // If zero shift count: X must not be modified.
+       raw_test_b_rr(X86_CL, X86_CL);
+       raw_jz_b_oponly();
+       branchadd1 = get_target();
+       skip_byte();
+
+       // Non-zero shift count.
+       // Do not modify C, N and Z.
+       // C -> X
+       *branchadd2 = (uintptr)get_target() - ((uintptr)branchadd2 + 1);
+       raw_popfl();
+       // Execute "duplicate_carry()"
+       COMPCALL(setcc_m)((uintptr)live.state[FLAGX].mem, NATIVE_CC_CS);
+       log_vwrite(FLAGX);
        raw_jmp_b_oponly();
        branchadd3 = get_target();
        skip_byte();
-       *branchadd1b = (uintptr)get_target() - ((uintptr)branchadd1b + 1);
 
-       /* shift count was >=32, set all flags */
+       // Zero shift count after CNZ adjustments
+       *branchadd1 = (uintptr)get_target() - ((uintptr)branchadd1 + 1);
        raw_popfl();
-       /* Set Z and N */
-       switch (size)
-       {
-       case 1: raw_test_b_rr(data, data); break;
-       case 2: raw_test_w_rr(data, data); break;
-       case 4: raw_test_l_rr(data, data); break;
-       }
-       /* Set C */
-       raw_bt_l_ri(data, 0);
 
        *branchadd3 = (uintptr)get_target() - ((uintptr)branchadd3 + 1);
-
-       unlock2(data);
-
-       *branchadd2 = (uintptr)get_target() - ((uintptr)branchadd2 + 1);
 }
 
 /*
@@ -2234,6 +2234,15 @@ MIDFUNC(2,cmp_b,(RR1 d, RR1 s))
        unlock2(s);
 }
 
+MIDFUNC(2, cmp_b_ri, (RR1 r, IMM i))
+{
+       CLOBBER_CMP;
+       r = readreg(r, 1);
+
+       raw_cmp_b_ri(r, i);
+       unlock2(r);
+}
+
 
 MIDFUNC(2,xor_l,(RW4 d, RR4 s))
 {
index a5b6df560184b6aaca4ccc84dd462c71a55d951e..a25102ed5b2356169d9c2b9131ac4b5ffafc492f 100644 (file)
@@ -180,13 +180,14 @@ DECLARE_MIDFUNC(cmp_l(RR4 d, RR4 s));
 DECLARE_MIDFUNC(cmp_l_ri(RR4 r, IMM i));
 DECLARE_MIDFUNC(cmp_w(RR2 d, RR2 s));
 DECLARE_MIDFUNC(cmp_b(RR1 d, RR1 s));
+DECLARE_MIDFUNC(cmp_b_ri(RR1 r, IMM i));
 DECLARE_MIDFUNC(xor_l(RW4 d, RR4 s));
 DECLARE_MIDFUNC(xor_w(RW2 d, RR2 s));
 DECLARE_MIDFUNC(xor_b(RW1 d, RR1 s));
 DECLARE_MIDFUNC(live_flags(void));
 DECLARE_MIDFUNC(dont_care_flags(void));
 DECLARE_MIDFUNC(duplicate_carry(void));
-DECLARE_MIDFUNC(setcc_for_cntzero(RR4 d, RR4 data, int size, int ov));
+DECLARE_MIDFUNC(setcc_for_cntzero(RR4 d, RR4 data, RR4 odata, int obit, int size, int ov));
 DECLARE_MIDFUNC(clear_overflow(void));
 DECLARE_MIDFUNC(restore_carry(void));
 DECLARE_MIDFUNC(start_needflags(void));
index 55b108270c4ff0c30269c789ad5c7cc267a725ef..4142813ae9336beb0e48eaf783f46ac68868a208 100644 (file)
@@ -2320,41 +2320,55 @@ gen_opcode(unsigned int opcode)
                if (curi->smode != immi) {
                        uses_cmov;
                        start_brace();
-                       comprintf("\tint zero = scratchie++;\n");
+                       comprintf("\tint cdata = scratchie++;\n");
                        comprintf("\tint tmpcnt = scratchie++;\n");
-                       comprintf("\tint minus1 = scratchie++;\n");
-                       comprintf("\tint cdata = minus1;\n");
-                       comprintf("\tmov_l_rr(tmpcnt,cnt);\n");
-                       comprintf("\tand_l_ri(tmpcnt,63);\n");
-                       comprintf("\tmov_l_ri(zero, 0);\n");
-                       comprintf("\tmov_l_ri(minus1, -1);\n");
+                       comprintf("\tint setval = scratchie++;\n");
+                       if (!noflags) {
+                               comprintf("\tint odata = scratchie++;\n");
+                       }
+                       comprintf("\tmov_l_ri(cdata, 0);\n");
+                       comprintf("\tmov_l_ri(setval, 0xffffffff);\n");
+                       // if high bit = 0: setval = 0x00000000, else setval = 0xffffffff
+                       comprintf("\ttest_l_ri(data, 0x%08x);\n", curi->size == sz_byte ? 0x80 : (curi->size == sz_word ? 0x8000 : 0x80000000));
+                       comprintf("\tcmov_l_rr(setval, cdata, NATIVE_CC_EQ);\n");
+                       comprintf("\tmov_l_rr(cdata, setval);\n");
+                       if (!noflags) {
+                               // setval -> odata
+                               comprintf("\tmov_l_rr(odata, setval);\n");
+                       }
+                       comprintf("\tmov_l_rr(tmpcnt, cnt);\n");
+                       comprintf("\tand_l_ri(tmpcnt, 63);\n");
+                       if (!noflags) {
+                               // shift == 0: tmpcnt (0) -> odata (C is always zero)
+                               comprintf("\tcmov_l_rr(odata, tmpcnt, NATIVE_CC_EQ);\n");
+                       }
+
                        switch (curi->size) {
                        case sz_byte:
-                               comprintf("\ttest_b_rr(data,data);\n");
-                               comprintf("\tcmov_l_rr(zero, minus1, NATIVE_CC_MI);\n");
-                               comprintf("\ttest_l_ri(tmpcnt, 0x38);\n");
-                               comprintf("\tmov_l_rr(cdata,data);\n");
-                               comprintf("\tcmov_l_rr(cdata, zero, NATIVE_CC_NE);\n");
-                               comprintf("\tshra_b_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_b_rr(data,cdata);\n");
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x08);\n");
+                               // shift > 8: setval -> cdata
+                               comprintf("\tcmov_l_rr(cdata, setval, NATIVE_CC_HI);\n");
+                               // shift <= 8: data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshra_b_rr(cdata, tmpcnt);\n");                     
                                break;
                        case sz_word:
-                               comprintf("\ttest_w_rr(data,data);\n");
-                               comprintf("\tcmov_l_rr(zero, minus1, NATIVE_CC_MI);\n");
-                               comprintf("\ttest_l_ri(tmpcnt, 0x30);\n");
-                               comprintf("\tmov_l_rr(cdata,data);\n");
-                               comprintf("\tcmov_l_rr(cdata, zero, NATIVE_CC_NE);\n");
-                               comprintf("\tshra_w_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_w_rr(data,cdata);\n");
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x10);\n");
+                               // shift > 16: setval -> cdata
+                               comprintf("\tcmov_l_rr(cdata, setval, NATIVE_CC_HI);\n");
+                               // shift <= 16: data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshra_w_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_long:
-                               comprintf("\ttest_l_rr(data,data);\n");
-                               comprintf("\tcmov_l_rr(zero, minus1, NATIVE_CC_MI);\n");
-                               comprintf("\ttest_l_ri(tmpcnt, 0x20);\n");
-                               comprintf("\tmov_l_rr(cdata,data);\n");
-                               comprintf("\tcmov_l_rr(cdata, zero, NATIVE_CC_NE);\n");
-                               comprintf("\tshra_l_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_l_rr(data,cdata);\n");
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x20);\n");
+                               // shift > 32: setval -> cdata
+                               comprintf("\tcmov_l_rr(cdata, setval, NATIVE_CC_HI);\n");
+                               // shift == 32? setval -> data (x86 masks count by 31, 680x0 uses mask 63)
+                               comprintf("\tcmov_l_rr(data, setval, NATIVE_CC_EQ);\n");
+                               // shift <= 32: data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshra_l_rr(cdata, tmpcnt);\n");
                                break;
                        default: assert(0);
                        }
@@ -2371,12 +2385,22 @@ gen_opcode(unsigned int opcode)
                if (!noflags) {
                        comprintf("\tlive_flags();\n");
                        comprintf("\tend_needflags();\n");
-                       if (curi->smode != immi)
-                               comprintf("\tsetcc_for_cntzero(tmpcnt, data, %d, 0);\n", curi->size == sz_byte ? 1 : curi->size == sz_word ? 2 : 4);
-                       else
+                       if (curi->smode != immi) {
+                               comprintf("\tsetcc_for_cntzero(tmpcnt, cdata, odata, 0, %d, 0);\n", curi->size);
+                       } else {
                                comprintf("\tduplicate_carry();\n");
+                       }
                        comprintf("if (!(needed_flags & FLAG_CZNV)) dont_care_flags();\n");
                }
+               if (curi->smode != immi) {
+                       switch (curi->size) {
+                       case sz_byte: comprintf("\tmov_b_rr(data, cdata);\n"); break;
+                       case sz_word: comprintf("\tmov_w_rr(data, cdata);\n"); break;
+                       case sz_long: comprintf("\tmov_l_rr(data, cdata);\n"); break;
+                       default: assert(0);
+                       }
+               }
+
                genastore("data", curi->dmode, "dstreg", curi->size, "data");
                break;
 
@@ -2414,28 +2438,51 @@ gen_opcode(unsigned int opcode)
                        uses_cmov;
                        start_brace();
                        comprintf("\tint cdata = scratchie++;\n");
-                       comprintf("\tint tmpcnt=scratchie++;\n");
-                       comprintf("\tmov_l_rr(tmpcnt,cnt);\n");
-                       comprintf("\tand_l_ri(tmpcnt,63);\n");
+                       comprintf("\tint tmpcnt = scratchie++;\n");
+                       if (!noflags) {
+                               comprintf("\tint odata = scratchie++;\n");
+                               comprintf("\tmov_l_rr(odata, data);\n");
+                       }
                        comprintf("\tmov_l_ri(cdata, 0);\n");
+                       comprintf("\tmov_l_rr(tmpcnt, cnt);\n");
+                       comprintf("\tand_l_ri(tmpcnt, 63);\n");
+                       if (!noflags) {
+                               // shift == 0? cdata (0) -> odata (C is always zero)
+                               comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_EQ);\n");
+                       }
+
                        switch (curi->size) {
                        case sz_byte:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x38);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshll_b_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_b_rr(data, cdata);\n");
+                               // shift > 8? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x08);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift <= 8? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshll_b_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_word:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x30);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshll_w_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_w_rr(data, cdata);\n");
+                               // shift > 16? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x10);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift <= 16? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshll_w_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_long:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x20);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshll_l_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_l_rr(data, cdata);\n");
+                               // shift > 32? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x20);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift == 32? cdata (0) -> data (x86 masks count by 31, 680x0 uses mask 63)
+                               comprintf("\tcmov_l_rr(data, cdata, NATIVE_CC_EQ);\n");
+                               // shift <= 32? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshll_l_rr(cdata, tmpcnt);\n");
                                break;
                        default: assert(0);
                        }
@@ -2452,12 +2499,22 @@ gen_opcode(unsigned int opcode)
                if (!noflags) {
                        comprintf("\tlive_flags();\n");
                        comprintf("\tend_needflags();\n");
-                       if (curi->smode != immi)
-                               comprintf("\tsetcc_for_cntzero(tmpcnt, data, %d, 0);\n", curi->size == sz_byte ? 1 : curi->size == sz_word ? 2 : 4);
-                       else
+                       if (curi->smode != immi) {
+                               comprintf("\tsetcc_for_cntzero(tmpcnt, cdata, odata, 0, %d, 1);\n", curi->size);
+                       } else {
+                               comprintf("\tclear_overflow();\n");
                                comprintf("\tduplicate_carry();\n");
+                       }
                        comprintf("if (!(needed_flags & FLAG_CZNV)) dont_care_flags();\n");
                }
+               if (curi->smode != immi) {
+                       switch (curi->size) {
+                       case sz_byte: comprintf("\tmov_b_rr(data, cdata);\n"); break;
+                       case sz_word: comprintf("\tmov_w_rr(data, cdata);\n"); break;
+                       case sz_long: comprintf("\tmov_l_rr(data, cdata);\n"); break;
+                       default: assert(0);
+                       }
+               }
                genastore("data", curi->dmode, "dstreg", curi->size, "data");
                break;
 
@@ -2485,30 +2542,52 @@ gen_opcode(unsigned int opcode)
                if (curi->smode != immi) {
                        uses_cmov;
                        start_brace();
-                       comprintf("\tint cdata=scratchie++;\n");
-                       comprintf("\tint tmpcnt=scratchie++;\n");
-                       comprintf("\tmov_l_rr(tmpcnt,cnt);\n");
-                       comprintf("\tand_l_ri(tmpcnt,63);\n");
+                       comprintf("\tint cdata = scratchie++;\n");
+                       comprintf("\tint tmpcnt = scratchie++;\n");
+                       if (!noflags) {
+                               comprintf("\tint odata = scratchie++;\n");
+                               comprintf("\tmov_l_rr(odata, data);\n");
+                       }
                        comprintf("\tmov_l_ri(cdata, 0);\n");
-
+                       comprintf("\tmov_l_rr(tmpcnt, cnt);\n");
+                       comprintf("\tand_l_ri(tmpcnt, 63);\n");
+                       if (!noflags) {
+                               // shift == 0? cdata (0) -> odata (C is always zero)
+                               comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_EQ);\n");
+                       }
+                       
                        switch (curi->size) {
                        case sz_byte:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x38);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshrl_b_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_b_rr(data, cdata);\n");
+                               // shift > 8? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x08);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift <= 8? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshrl_b_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_word:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x30);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshrl_w_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_w_rr(data, cdata);\n");
+                               // shift > 16? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x10);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift <= 16? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshrl_w_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_long:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x20);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
+                               // shift > 32? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x20);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift == 32? cdata (0) -> data (x86 masks count by 31, 680x0 uses mask 63)
+                               comprintf("\tcmov_l_rr(data, cdata, NATIVE_CC_EQ);\n");
+                               // shift <= 32? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
                                comprintf("\tshrl_l_rr(cdata, tmpcnt);\n");
-                               comprintf("\tmov_l_rr(data, cdata);\n");
                                break;
                        default: assert(0);
                        }
@@ -2526,12 +2605,20 @@ gen_opcode(unsigned int opcode)
                        comprintf("\tlive_flags();\n");
                        comprintf("\tend_needflags();\n");
                        if (curi->smode != immi) {
-                               comprintf("\tsetcc_for_cntzero(tmpcnt, data, %d, 0);\n", curi->size == sz_byte ? 1 : curi->size == sz_word ? 2 : 4);
+                               comprintf("\tsetcc_for_cntzero(tmpcnt, cdata, odata, %d, %d, 1);\n", curi->size == sz_byte ? 7 : curi->size == sz_word ? 15 : 31, curi->size);
                        } else {
                                comprintf("\tduplicate_carry();\n");
                        }
                        comprintf("if (!(needed_flags & FLAG_CZNV)) dont_care_flags();\n");
                }
+               if (curi->smode != immi) {
+                       switch (curi->size) {
+                       case sz_byte: comprintf("\tmov_b_rr(data, cdata);\n"); break;
+                       case sz_word: comprintf("\tmov_w_rr(data, cdata);\n"); break;
+                       case sz_long: comprintf("\tmov_l_rr(data, cdata);\n"); break;
+                       default: assert(0);
+                       }
+               }
                genastore("data", curi->dmode, "dstreg", curi->size, "data");
                break;
 
@@ -2561,28 +2648,50 @@ gen_opcode(unsigned int opcode)
                        start_brace();
                        comprintf("\tint cdata = scratchie++;\n");
                        comprintf("\tint tmpcnt = scratchie++;\n");
-                       comprintf("\tmov_l_rr(tmpcnt,cnt);\n");
-                       comprintf("\tand_l_ri(tmpcnt,63);\n");
+                       if (!noflags) {
+                               comprintf("\tint odata = scratchie++;\n");
+                               comprintf("\tmov_l_rr(odata, data);\n");
+                       }
                        comprintf("\tmov_l_ri(cdata, 0);\n");
+                       comprintf("\tmov_l_rr(tmpcnt, cnt);\n");
+                       comprintf("\tand_l_ri(tmpcnt, 63);\n");
+                       if (!noflags) {
+                               // shift == 0? cdata (0) -> odata (C is always zero)
+                               comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_EQ);\n");
+                       }
 
                        switch (curi->size) {
                        case sz_byte:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x38);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshll_b_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_b_rr(data, cdata);\n");
+                               // shift > 8? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x08);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift <= 8? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshll_b_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_word:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x30);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshll_w_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_w_rr(data, cdata);\n");
+                               // shift > 16? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x10);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift <= 16? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshll_w_rr(cdata, tmpcnt);\n");
                                break;
                        case sz_long:
-                               comprintf("\ttest_l_ri(tmpcnt, 0x20);\n");
-                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_EQ);\n");
-                               comprintf("\tshll_l_rr(cdata,tmpcnt);\n");
-                               comprintf("\tmov_l_rr(data, cdata);\n");
+                               // shift > 32? 0 -> odata (C is always zero)
+                               comprintf("\tcmp_b_ri(tmpcnt, 0x20);\n");
+                               if (!noflags) {
+                                       comprintf("\tcmov_l_rr(odata, cdata, NATIVE_CC_HI);\n");
+                               }
+                               // shift == 32? cdata (0) -> data (x86 masks count by 31, 680x0 uses mask 63)
+                               comprintf("\tcmov_l_rr(data, cdata, NATIVE_CC_EQ);\n");
+                               // shift <= 32? data -> cdata ("normal" shift)
+                               comprintf("\tcmov_l_rr(cdata, data, NATIVE_CC_LS);\n");
+                               comprintf("\tshll_l_rr(cdata, tmpcnt);\n");
                                break;
                        default: assert(0);
                        }
@@ -2600,13 +2709,21 @@ gen_opcode(unsigned int opcode)
                        comprintf("\tlive_flags();\n");
                        comprintf("\tend_needflags();\n");
                        if (curi->smode != immi) {
-                               comprintf("\tsetcc_for_cntzero(tmpcnt, data, %d, 1);\n", curi->size == sz_byte ? 1 : curi->size == sz_word ? 2 : 4);
+                               comprintf("\tsetcc_for_cntzero(tmpcnt, cdata, odata, 0, %d, 1);\n", curi->size);
                        } else {
                                comprintf("\tclear_overflow();\n");
                                comprintf("\tduplicate_carry();\n");
                        }
                        comprintf("if (!(needed_flags & FLAG_CZNV)) dont_care_flags();\n");
                }
+               if (curi->smode != immi) {
+                       switch (curi->size) {
+                       case sz_byte: comprintf("\tmov_b_rr(data, cdata);\n"); break;
+                       case sz_word: comprintf("\tmov_w_rr(data, cdata);\n"); break;
+                       case sz_long: comprintf("\tmov_l_rr(data, cdata);\n"); break;
+                       default: assert(0);
+                       }
+               }
                genastore("data", curi->dmode, "dstreg", curi->size, "data");
                break;