]> git.unchartedbackwaters.co.uk Git - francis/winuae.git/commitdiff
PCem Voodoo emulation
authorToni Wilen <twilen@winuae.net>
Sat, 19 Dec 2020 20:31:24 +0000 (22:31 +0200)
committerToni Wilen <twilen@winuae.net>
Sat, 19 Dec 2020 20:31:24 +0000 (22:31 +0200)
27 files changed:
pcem/vid_voodoo.cpp [new file with mode: 0644]
pcem/vid_voodoo.h [new file with mode: 0644]
pcem/vid_voodoo_banshee.cpp [new file with mode: 0644]
pcem/vid_voodoo_banshee.h [new file with mode: 0644]
pcem/vid_voodoo_banshee_blitter.cpp [new file with mode: 0644]
pcem/vid_voodoo_banshee_blitter.h [new file with mode: 0644]
pcem/vid_voodoo_blitter.cpp [new file with mode: 0644]
pcem/vid_voodoo_blitter.h [new file with mode: 0644]
pcem/vid_voodoo_codegen_x86-64.h [new file with mode: 0644]
pcem/vid_voodoo_codegen_x86.h [new file with mode: 0644]
pcem/vid_voodoo_common.h [new file with mode: 0644]
pcem/vid_voodoo_display.cpp [new file with mode: 0644]
pcem/vid_voodoo_display.h [new file with mode: 0644]
pcem/vid_voodoo_dither.h [new file with mode: 0644]
pcem/vid_voodoo_fb.cpp [new file with mode: 0644]
pcem/vid_voodoo_fb.h [new file with mode: 0644]
pcem/vid_voodoo_fifo.cpp [new file with mode: 0644]
pcem/vid_voodoo_fifo.h [new file with mode: 0644]
pcem/vid_voodoo_reg.cpp [new file with mode: 0644]
pcem/vid_voodoo_reg.h [new file with mode: 0644]
pcem/vid_voodoo_regs.h [new file with mode: 0644]
pcem/vid_voodoo_render.cpp [new file with mode: 0644]
pcem/vid_voodoo_render.h [new file with mode: 0644]
pcem/vid_voodoo_setup.cpp [new file with mode: 0644]
pcem/vid_voodoo_setup.h [new file with mode: 0644]
pcem/vid_voodoo_texture.cpp [new file with mode: 0644]
pcem/vid_voodoo_texture.h [new file with mode: 0644]

diff --git a/pcem/vid_voodoo.cpp b/pcem/vid_voodoo.cpp
new file mode 100644 (file)
index 0000000..ed4553c
--- /dev/null
@@ -0,0 +1,1470 @@
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "pci.h"
+#include "thread.h"
+#include "timer.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_blitter.h"
+#include "vid_voodoo_display.h"
+#include "vid_voodoo_dither.h"
+#include "vid_voodoo_fb.h"
+#include "vid_voodoo_fifo.h"
+#include "vid_voodoo_reg.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_texture.h"
+
+
+/*Global rgba8_t tables with one entry per possible 8-bit (0x100) or 16-bit
+  (0x10000) value. Presumably colour-format -> RGBA8 conversion lookups named
+  after the source format - TODO confirm against the code that fills them,
+  which is not in this part of the file*/
+rgba8_t rgb332[0x100], ai44[0x100], rgb565[0x10000], argb1555[0x10000], argb4444[0x10000], ai88[0x10000];
+
+
+/*Externally visible counter, starts at zero.
+  NOTE(review): by its name it likely counts triangles - confirm at its users*/
+int tris = 0;
+
+/*File-local 64-bit value, starts at zero.
+  NOTE(review): no user is visible in this part of the file - confirm it is still needed*/
+static uint64_t status_time = 0;
+
+
+/*Recompute the derived framebuffer layout (buffer offsets, LFB read/write
+  targets, draw target and row width) from fbiInit1/2/6, lfbMode, fbzMode and
+  the current disp_buffer/draw_buffer indices.
+  Cards of type VOODOO_BANSHEE or later are left untouched.
+  Calls fatal() when the LFB read select or the draw buffer select is unknown.*/
+void voodoo_recalc(voodoo_t *voodoo)
+{
+        /*fbiInit2 bits 11-19 : size of one buffer in 4096-byte units*/
+        uint32_t page_bytes = ((voodoo->fbiInit2 >> 11) & 511) * 4096;
+        int blocks;
+
+        if (voodoo->type >= VOODOO_BANSHEE)
+                return;
+
+        voodoo->params.front_offset = voodoo->disp_buffer * page_bytes;
+        voodoo->back_offset = voodoo->draw_buffer * page_bytes;
+
+        /*The aux buffer sits behind the colour buffers : three of them when
+          triple buffering, two otherwise. The cutoff is one buffer further on.*/
+        if (TRIPLE_BUFFER)
+        {
+                voodoo->buffer_cutoff = page_bytes * 4;
+                voodoo->params.aux_offset = page_bytes * 3;
+        }
+        else
+        {
+                voodoo->buffer_cutoff = page_bytes * 3;
+                voodoo->params.aux_offset = page_bytes * 2;
+        }
+
+        /*LFB write target*/
+        switch (voodoo->lfbMode & LFB_WRITE_MASK)
+        {
+                case LFB_WRITE_FRONT:
+                voodoo->fb_write_buffer = voodoo->disp_buffer;
+                voodoo->fb_write_offset = voodoo->params.front_offset;
+                break;
+
+                case LFB_WRITE_BACK:
+                voodoo->fb_write_buffer = voodoo->draw_buffer;
+                voodoo->fb_write_offset = voodoo->back_offset;
+                break;
+
+                default:
+                /*BreakNeck sets invalid LFB write buffer select - fall back to the
+                  front buffer offset; fb_write_buffer keeps its previous value*/
+                voodoo->fb_write_offset = voodoo->params.front_offset;
+                break;
+        }
+
+        /*LFB read source*/
+        switch (voodoo->lfbMode & LFB_READ_MASK)
+        {
+                case LFB_READ_FRONT:
+                voodoo->fb_read_offset = voodoo->params.front_offset;
+                break;
+
+                case LFB_READ_BACK:
+                voodoo->fb_read_offset = voodoo->back_offset;
+                break;
+
+                case LFB_READ_AUX:
+                voodoo->fb_read_offset = voodoo->params.aux_offset;
+                break;
+
+                default:
+                fatal("voodoo_recalc : unknown lfb source\n");
+        }
+
+        /*Render target*/
+        switch (voodoo->params.fbzMode & FBZ_DRAW_MASK)
+        {
+                case FBZ_DRAW_FRONT:
+                voodoo->fb_draw_buffer = voodoo->disp_buffer;
+                voodoo->params.draw_offset = voodoo->params.front_offset;
+                break;
+
+                case FBZ_DRAW_BACK:
+                voodoo->fb_draw_buffer = voodoo->draw_buffer;
+                voodoo->params.draw_offset = voodoo->back_offset;
+                break;
+
+                default:
+                fatal("voodoo_recalc : unknown draw buffer\n");
+        }
+
+        /*Block count : fbiInit1 bits 4-7 doubled, +1 for fbiInit6 bit 30,
+          +32 for fbiInit1 bit 24. A row is block_width * 32 * 2 wide.*/
+        blocks = ((voodoo->fbiInit1 >> 4) & 15) * 2;
+        if (voodoo->fbiInit6 & (1 << 30))
+                blocks += 1;
+        if (voodoo->fbiInit1 & (1 << 24))
+                blocks += 32;
+        voodoo->block_width = blocks;
+
+        voodoo->row_width = voodoo->block_width * 32 * 2;
+        voodoo->aux_row_width = voodoo->row_width;
+        voodoo->params.row_width = voodoo->row_width;
+        voodoo->params.aux_row_width = voodoo->aux_row_width;
+
+/*        pclog("voodoo_recalc : front_offset %08X  back_offset %08X  aux_offset %08X draw_offset %08x\n", voodoo->params.front_offset, voodoo->back_offset, voodoo->params.aux_offset, voodoo->params.draw_offset);
+        pclog("                fb_read_offset %08X  fb_write_offset %08X  row_width %i  %08x %08x\n", voodoo->fb_read_offset, voodoo->fb_write_offset, voodoo->row_width, voodoo->lfbMode, voodoo->params.fbzMode);*/
+}
+
+
+/*16-bit read handler for the card's memory window.
+  addr - bus address, only the low 24 bits are used
+  p    - the voodoo_t this handler was registered with
+  Returns the framebuffer word for addresses in the framebuffer range
+  (addr & 0xc00000 == 0x400000) and 0xffff for everything else.
+  Every call charges voodoo->read_time cycles.*/
+static uint16_t voodoo_readw(uint32_t addr, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+
+        addr &= 0xffffff;
+        cycles -= voodoo->read_time;
+
+        /*Anything outside the framebuffer range reads as all ones*/
+        if ((addr & 0xc00000) != 0x400000)
+                return 0xffff;
+
+        if (SLI_ENABLED)
+        {
+                /*Address bits 11-20 are taken as y : odd values are served by the
+                  second card of the set, even values by the first*/
+                int y = (addr >> 11) & 0x3ff;
+
+                voodoo = voodoo->set->voodoos[y & 1];
+        }
+
+        /*Drain the FIFO and let the render thread go idle before reading,
+          so the read sees the result of everything queued so far*/
+        voodoo->flush = 1;
+        while (!FIFO_EMPTY)
+        {
+                voodoo_wake_fifo_thread_now(voodoo);
+                thread_wait_event(voodoo->fifo_not_full_event, 1);
+        }
+        voodoo_wait_for_render_thread_idle(voodoo);
+        voodoo->flush = 0;
+
+        return voodoo_fb_readw(addr, voodoo);
+}
+
+
+/*32-bit read handler for the card's memory window.
+  addr - bus address, only the low 24 bits are used
+  p    - the voodoo_t this handler was registered with
+  Returns the framebuffer dword for framebuffer addresses, the register value
+  for register addresses (selected by addr & 0x3fc), and 0xffffffff for
+  texture-range reads and unknown registers.
+  Every call bumps rd_count and charges voodoo->read_time cycles.*/
+static uint32_t voodoo_readl(uint32_t addr, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        /*Defined default : the texture branch below stores nothing, so without
+          this initialiser a texture-range read returned an indeterminate value*/
+        uint32_t temp = 0xffffffff;
+
+        voodoo->rd_count++;
+        addr &= 0xffffff;
+
+        cycles -= voodoo->read_time;
+
+        if (addr & 0x800000) /*Texture*/
+        {
+                /*Texture reads are not implemented - temp keeps its default*/
+        }
+        else if (addr & 0x400000) /*Framebuffer*/
+        {
+                if (SLI_ENABLED)
+                {
+                        voodoo_set_t *set = voodoo->set;
+                        int y = (addr >> 11) & 0x3ff;
+
+                        /*Odd y is served by the second card, even y by the first*/
+                        if (y & 1)
+                                voodoo = set->voodoos[1];
+                        else
+                                voodoo = set->voodoos[0];
+                }
+
+                /*Drain the FIFO and wait for the render thread before reading*/
+                voodoo->flush = 1;
+                while (!FIFO_EMPTY)
+                {
+                        voodoo_wake_fifo_thread_now(voodoo);
+                        thread_wait_event(voodoo->fifo_not_full_event, 1);
+                }
+                voodoo_wait_for_render_thread_idle(voodoo);
+                voodoo->flush = 0;
+
+                temp = voodoo_fb_readl(addr, voodoo);
+        }
+        else switch (addr & 0x3fc)
+        {
+                case SST_status:
+                {
+                        int fifo_entries = FIFO_ENTRIES;
+                        int fifo_size;
+                        int swap_count = voodoo->swap_count;
+                        int written = voodoo->cmd_written + voodoo->cmd_written_fifo;
+                        int busy = (written - voodoo->cmd_read) || (voodoo->cmdfifo_depth_rd != voodoo->cmdfifo_depth_wr);
+
+                        if (SLI_ENABLED && voodoo->type != VOODOO_2)
+                        {
+                                /*Report the worse of the two cards : larger swap count,
+                                  fuller FIFO, busy if either one is busy*/
+                                voodoo_t *voodoo_other = (voodoo == voodoo->set->voodoos[0]) ? voodoo->set->voodoos[1] : voodoo->set->voodoos[0];
+                                int other_written = voodoo_other->cmd_written + voodoo_other->cmd_written_fifo;
+
+                                if (voodoo_other->swap_count > swap_count)
+                                        swap_count = voodoo_other->swap_count;
+                                if ((voodoo_other->fifo_write_idx - voodoo_other->fifo_read_idx) > fifo_entries)
+                                        fifo_entries = voodoo_other->fifo_write_idx - voodoo_other->fifo_read_idx;
+                                if ((other_written - voodoo_other->cmd_read) ||
+                                    (voodoo_other->cmdfifo_depth_rd != voodoo_other->cmdfifo_depth_wr))
+                                        busy = 1;
+                                if (!voodoo_other->voodoo_busy)
+                                        voodoo_wake_fifo_thread(voodoo_other);
+                        }
+
+                        /*Free FIFO space goes in bits 12 and up; the low six bits
+                          carry the same value capped at 0x3f*/
+                        fifo_size = 0xffff - fifo_entries;
+                        temp = fifo_size << 12;
+                        if (fifo_size < 0x40)
+                                temp |= fifo_size;
+                        else
+                                temp |= 0x3f;
+                        /*Pending swap count in bits 28-30, capped at 7*/
+                        if (swap_count < 7)
+                                temp |= (swap_count << 28);
+                        else
+                                temp |= (7 << 28);
+                        /*Bit 6 is set while v_retrace is clear*/
+                        if (!voodoo->v_retrace)
+                                temp |= 0x40;
+
+                        if (busy)
+                                temp |= 0x380; /*Busy*/
+
+                        if (!voodoo->voodoo_busy)
+                                voodoo_wake_fifo_thread(voodoo);
+                }
+                break;
+
+                /*Rendering state registers : flush queued writes first so the
+                  value read back reflects them*/
+                case SST_fbzColorPath:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.fbzColorPath;
+                break;
+                case SST_fogMode:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.fogMode;
+                break;
+                case SST_alphaMode:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.alphaMode;
+                break;
+                case SST_fbzMode:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.fbzMode;
+                break;
+                case SST_lfbMode:
+                voodoo_flush(voodoo);
+                temp = voodoo->lfbMode;
+                break;
+                case SST_clipLeftRight:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.clipRight | (voodoo->params.clipLeft << 16);
+                break;
+                case SST_clipLowYHighY:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.clipHighY | (voodoo->params.clipLowY << 16);
+                break;
+
+                case SST_stipple:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.stipple;
+                break;
+                case SST_color0:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.color0;
+                break;
+                case SST_color1:
+                voodoo_flush(voodoo);
+                temp = voodoo->params.color1;
+                break;
+
+                /*Pixel statistics counters are reported as 24-bit values*/
+                case SST_fbiPixelsIn:
+                temp = voodoo->fbiPixelsIn & 0xffffff;
+                break;
+                case SST_fbiChromaFail:
+                temp = voodoo->fbiChromaFail & 0xffffff;
+                break;
+                case SST_fbiZFuncFail:
+                temp = voodoo->fbiZFuncFail & 0xffffff;
+                break;
+                case SST_fbiAFuncFail:
+                temp = voodoo->fbiAFuncFail & 0xffffff;
+                break;
+                case SST_fbiPixelsOut:
+                temp = voodoo->fbiPixelsOut & 0xffffff;
+                break;
+
+                case SST_fbiInit4:
+                temp = voodoo->fbiInit4;
+                break;
+                case SST_fbiInit0:
+                temp = voodoo->fbiInit0;
+                break;
+                case SST_fbiInit1:
+                temp = voodoo->fbiInit1;
+                break;
+                case SST_fbiInit2:
+                /*With initEnable bit 2 set this address returns the DAC read data*/
+                if (voodoo->initEnable & 0x04)
+                        temp = voodoo->dac_readdata;
+                else
+                        temp = voodoo->fbiInit2;
+                break;
+                case SST_fbiInit3:
+                temp = voodoo->fbiInit3 | (1 << 10) | (2 << 8);
+                break;
+
+                case SST_vRetrace:
+                temp = voodoo->line & 0x1fff;
+                break;
+                case SST_hvRetrace:
+                {
+                        /*Horizontal position is derived from the time left until the
+                          next timer event, scaled to h_total*/
+                        uint32_t line_time = (uint32_t)(voodoo->line_time >> 32);
+                        uint32_t diff = (timer_get_ts_int(&voodoo->timer) > (tsc & 0xffffffff)) ? (timer_get_ts_int(&voodoo->timer) - (tsc & 0xffffffff)) : 0;
+                        uint32_t pre_div = diff * voodoo->h_total;
+                        /*line_time is zero when the upper half of voodoo->line_time is
+                          zero; avoid dividing by zero in that case*/
+                        uint32_t post_div = line_time ? (pre_div / line_time) : 0;
+                        uint32_t h_pos = (voodoo->h_total - 1) - post_div;
+
+                        /*Also catches the unsigned wrap when post_div > h_total - 1*/
+                        if (h_pos >= voodoo->h_total)
+                                h_pos = 0;
+
+                        temp = voodoo->line & 0x1fff;
+                        temp |= (h_pos << 16);
+                }
+                break;
+
+                case SST_fbiInit5:
+                temp = voodoo->fbiInit5 & ~0x1ff;
+                break;
+                case SST_fbiInit6:
+                temp = voodoo->fbiInit6;
+                break;
+                case SST_fbiInit7:
+                temp = voodoo->fbiInit7 & ~0xff;
+                break;
+
+                case SST_cmdFifoBaseAddr:
+                /*Base page in the low half, end page in the high half*/
+                temp = voodoo->cmdfifo_base >> 12;
+                temp |= (voodoo->cmdfifo_end >> 12) << 16;
+                break;
+
+                case SST_cmdFifoRdPtr:
+                temp = voodoo->cmdfifo_rp;
+                break;
+                case SST_cmdFifoAMin:
+                temp = voodoo->cmdfifo_amin;
+                break;
+                case SST_cmdFifoAMax:
+                temp = voodoo->cmdfifo_amax;
+                break;
+                case SST_cmdFifoDepth:
+                temp = voodoo->cmdfifo_depth_wr - voodoo->cmdfifo_depth_rd;
+                break;
+
+                default:
+                pclog("voodoo_readl  : bad addr %08X\n", addr);
+                temp = 0xffffffff;
+                break;
+        }
+
+        return temp;
+}
+
+/*16-bit write handler for the card's memory window.
+  addr - bus address, only the low 24 bits are used
+  val  - value written
+  p    - the voodoo_t this handler was registered with
+  Framebuffer-range writes (addr & 0xc00000 == 0x400000) are queued as
+  FIFO_WRITEW_FB commands; all other 16-bit writes are dropped.
+  Every call bumps wr_count and charges voodoo->write_time cycles.*/
+static void voodoo_writew(uint32_t addr, uint16_t val, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        uint32_t offset = addr & 0xffffff;
+
+        voodoo->wr_count++;
+        cycles -= voodoo->write_time;
+
+        if ((offset & 0xc00000) != 0x400000)
+                return;
+
+        voodoo_queue_command(voodoo, offset | FIFO_WRITEW_FB, val); /*Framebuffer*/
+}
+
+/*32-bit write handler for the card's memory window.
+  addr - bus address, only the low 24 bits are used
+  val  - value written
+  p    - the voodoo_t this handler was registered with
+  Texture and framebuffer writes are queued to the FIFO. Writes with address
+  bit 21 set while the CMDFIFO is enabled are stored straight into fb_mem.
+  Everything else is a register write selected by addr & 0x3fc : the registers
+  listed below are handled immediately, all others are queued to the FIFO.
+  Every call bumps wr_count and charges burst_time or write_time cycles.*/
+static void voodoo_writel(uint32_t addr, uint32_t val, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+
+        voodoo->wr_count++;
+
+        addr &= 0xffffff;
+        
+        /*A write to the dword directly after the previous one is charged the
+          burst cost, anything else the normal write cost*/
+        if (addr == voodoo->last_write_addr+4)
+                cycles -= voodoo->burst_time;
+        else
+                cycles -= voodoo->write_time;
+        voodoo->last_write_addr = addr;
+
+        if (addr & 0x800000) /*Texture*/
+        {
+                voodoo->tex_count++;
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_TEX, val);
+        }
+        else if (addr & 0x400000) /*Framebuffer*/
+        {
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_FB, val);
+        }
+        else if ((addr & 0x200000) && (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE))
+        {
+                /*CMDFIFO data : store at cmdfifo_base + offset (wrapped by fb_mask),
+                  count the entry, and wake the FIFO thread only while fewer than
+                  20 entries are outstanding*/
+//                pclog("Write CMDFIFO %08x(%08x) %08x  %08x\n", addr, voodoo->cmdfifo_base + (addr & 0x3fffc), val, (voodoo->cmdfifo_base + (addr & 0x3fffc)) & voodoo->fb_mask);
+                *(uint32_t *)&voodoo->fb_mem[(voodoo->cmdfifo_base + (addr & 0x3fffc)) & voodoo->fb_mask] = val;
+                voodoo->cmdfifo_depth_wr++;
+                if ((voodoo->cmdfifo_depth_wr - voodoo->cmdfifo_depth_rd) < 20)
+                        voodoo_wake_fifo_thread(voodoo);
+        }
+        else switch (addr & 0x3fc)
+        {
+                /*Interrupt registers are not implemented : a write is reported through fatal()*/
+                case SST_intrCtrl:
+                fatal("intrCtrl write %08x\n", val);
+                break;
+
+                case SST_userIntrCMD:
+                fatal("userIntrCMD write %08x\n", val);
+                break;
+                
+                /*Unlike the other commands below, cmd_written and swap_count are
+                  updated before the CMDFIFO check, i.e. even when the write is
+                  then not queued*/
+                case SST_swapbufferCMD:
+                voodoo->cmd_written++;
+                thread_lock_mutex(voodoo->swap_mutex);
+                voodoo->swap_count++;
+                thread_unlock_mutex(voodoo->swap_mutex);
+                if (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE)
+                        return;
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_REG, val);
+                if (!voodoo->voodoo_busy)
+                        voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                break;
+                /*Command registers : ignored while the CMDFIFO is enabled, otherwise
+                  counted in cmd_written, queued, and the FIFO threads are woken if
+                  the card is not already busy*/
+                case SST_triangleCMD:
+                if (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE)
+                        return;
+                voodoo->cmd_written++;
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_REG, val);
+                if (!voodoo->voodoo_busy)
+                        voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                break;
+                case SST_ftriangleCMD:
+                if (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE)
+                        return;
+                voodoo->cmd_written++;
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_REG, val);
+                if (!voodoo->voodoo_busy)
+                        voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                break;
+                case SST_fastfillCMD:
+                if (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE)
+                        return;
+                voodoo->cmd_written++;
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_REG, val);
+                if (!voodoo->voodoo_busy)
+                        voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                break;
+                case SST_nopCMD:
+                if (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE)
+                        return;
+                voodoo->cmd_written++;
+                voodoo_queue_command(voodoo, addr | FIFO_WRITEL_REG, val);
+                if (!voodoo->voodoo_busy)
+                        voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                break;
+                        
+                /*fbiInit registers are only writable while initEnable bit 0 is set*/
+                case SST_fbiInit4:
+                if (voodoo->initEnable & 0x01)
+                {
+                        voodoo->fbiInit4 = val;
+                        /*fbiInit4 bit 0 doubles the burst part of the read cost*/
+                        voodoo->read_time = pci_nonburst_time + pci_burst_time * ((voodoo->fbiInit4 & 1) ? 2 : 1);
+//                        pclog("fbiInit4 write %08x - read_time=%i\n", val, voodoo->read_time);
+                }
+                break;
+                case SST_backPorch:
+                voodoo->backPorch = val;
+                break;
+                case SST_videoDimensions:
+                voodoo->videoDimensions = val;
+                /*Width is the low 12 bits plus one, height is bits 16-27 as written*/
+                voodoo->h_disp = (val & 0xfff) + 1;
+                voodoo->v_disp = (val >> 16) & 0xfff;
+                break;
+                case SST_fbiInit0:
+                if (voodoo->initEnable & 0x01)
+                {
+                        voodoo->fbiInit0 = val;
+                        /*fbiInit0 bit 0 drives the SVGA override; with two cards the
+                          override is on if either card has the bit set*/
+                        if (voodoo->set->nr_cards == 2)
+                                svga_set_override(voodoo->svga, (voodoo->set->voodoos[0]->fbiInit0 | voodoo->set->voodoos[1]->fbiInit0) & 1);
+                        else
+                                svga_set_override(voodoo->svga, val & 1);
+                        if (val & FBIINIT0_GRAPHICS_RESET)
+                        {
+                                /*Reset display/draw buffer selection. This may not actually
+                                  happen here on a real Voodoo*/
+                                voodoo->disp_buffer = 0;
+                                voodoo->draw_buffer = 1;
+                                voodoo_recalc(voodoo);
+                                voodoo->front_offset = voodoo->params.front_offset;
+                        }
+                }
+                break;
+                case SST_fbiInit1:
+                if (voodoo->initEnable & 0x01)
+                {
+                        /*Video reset going from set to clear restarts the line counter
+                          and clears the swap and retrace counts*/
+                        if ((voodoo->fbiInit1 & FBIINIT1_VIDEO_RESET) && !(val & FBIINIT1_VIDEO_RESET))
+                        {
+                                voodoo->line = 0;
+                                thread_lock_mutex(voodoo->swap_mutex);
+                                voodoo->swap_count = 0;
+                                thread_unlock_mutex(voodoo->swap_mutex);
+                                voodoo->retrace_count = 0;
+                        }
+                        /*Bits 0 and 2 keep their current value*/
+                        voodoo->fbiInit1 = (val & ~5) | (voodoo->fbiInit1 & 5);
+                        /*fbiInit1 bit 1 raises both the write and the burst cost*/
+                        voodoo->write_time = pci_nonburst_time + pci_burst_time * ((voodoo->fbiInit1 & 2) ? 1 : 0);
+                        voodoo->burst_time = pci_burst_time * ((voodoo->fbiInit1 & 2) ? 2 : 1);
+//                        pclog("fbiInit1 write %08x - write_time=%i burst_time=%i\n", val, voodoo->write_time, voodoo->burst_time);
+                }
+                break;
+                case SST_fbiInit2:
+                if (voodoo->initEnable & 0x01)
+                {
+                        voodoo->fbiInit2 = val;
+                        voodoo_recalc(voodoo);
+                }
+                break;
+                case SST_fbiInit3:
+                if (voodoo->initEnable & 0x01)
+                        voodoo->fbiInit3 = val;
+                break;
+
+                /*Totals are the sum of the two 16-bit halves of the sync register*/
+                case SST_hSync:
+                voodoo->hSync = val;
+                voodoo->h_total = (val & 0xffff) + (val >> 16);
+                voodoo_pixelclock_update(voodoo);
+                break;
+                case SST_vSync:
+                voodoo->vSync = val;
+                voodoo->v_total = (val & 0xffff) + (val >> 16);
+                break;
+                
+                /*CLUT entry : index in bits 24-29, r/g/b in bits 16-23/8-15/0-7.
+                  Bit 29 set forces the entry to white.*/
+                case SST_clutData:
+                voodoo->clutData[(val >> 24) & 0x3f].b = val & 0xff;
+                voodoo->clutData[(val >> 24) & 0x3f].g = (val >> 8) & 0xff;
+                voodoo->clutData[(val >> 24) & 0x3f].r = (val >> 16) & 0xff;
+                if (val & 0x20000000)
+                {
+                        voodoo->clutData[(val >> 24) & 0x3f].b = 255;
+                        voodoo->clutData[(val >> 24) & 0x3f].g = 255;
+                        voodoo->clutData[(val >> 24) & 0x3f].r = 255;
+                }
+                voodoo->clutData_dirty = 1;
+                break;
+
+                /*DAC access : register number in bits 8-10, bit 11 set = read
+                  (result latched in dac_readdata), clear = write of the low byte*/
+                case SST_dacData:
+                voodoo->dac_reg = (val >> 8) & 7;
+                voodoo->dac_readdata = 0xff;
+                if (val & 0x800)
+                {
+//                        pclog("  dacData read %i %02X\n", voodoo->dac_reg, voodoo->dac_data[7]);
+                        if (voodoo->dac_reg == 5)
+                        {
+                                /*Register 5 reads return a fixed value chosen by dac_data[7];
+                                  other dac_data[7] values leave the 0xff default*/
+                                switch (voodoo->dac_data[7])
+                                {
+                                       case 0x01: voodoo->dac_readdata = 0x55; break;
+                                       case 0x07: voodoo->dac_readdata = 0x71; break;
+                                       case 0x0b: voodoo->dac_readdata = 0x79; break;
+                                }
+                        }
+                        else
+                                /*NOTE(review): dac_readdata was set to 0xff just above, so this
+                                  index is always 7; dac_reg may have been intended - confirm*/
+                                voodoo->dac_readdata = voodoo->dac_data[voodoo->dac_readdata & 7];
+                }
+                else
+                {
+                        if (voodoo->dac_reg == 5)
+                        {
+                                /*PLL register selected by dac_data[4] : dac_reg_ff picks the low
+                                  byte (0) or high byte (1) and toggles on every write.
+                                  NOTE(review): val is not masked to 8 bits here, so the register
+                                  number bits of val are OR'd in as well - confirm this is harmless*/
+                                if (!voodoo->dac_reg_ff)
+                                        voodoo->dac_pll_regs[voodoo->dac_data[4] & 0xf] = (voodoo->dac_pll_regs[voodoo->dac_data[4] & 0xf] & 0xff00) | val;
+                                else
+                                        voodoo->dac_pll_regs[voodoo->dac_data[4] & 0xf] = (voodoo->dac_pll_regs[voodoo->dac_data[4] & 0xf] & 0xff) | (val << 8);
+//                                pclog("Write PLL reg %x %04x\n", voodoo->dac_data[4] & 0xf, voodoo->dac_pll_regs[voodoo->dac_data[4] & 0xf]);
+                                voodoo->dac_reg_ff = !voodoo->dac_reg_ff;
+                                /*After both bytes are written, advance to the next PLL register*/
+                                if (!voodoo->dac_reg_ff)
+                                        voodoo->dac_data[4]++;
+
+                        }
+                        else
+                        {
+                                voodoo->dac_data[voodoo->dac_reg] = val & 0xff;
+                                voodoo->dac_reg_ff = 0;
+                        }
+                        voodoo_pixelclock_update(voodoo);
+                }
+                break;
+
+               /*Screen filter threshold : a value of zero disables the filter*/
+               case SST_scrFilter:
+               if (voodoo->initEnable & 0x01)
+               {
+                       voodoo->scrfilterEnabled = 1;
+                       voodoo->scrfilterThreshold = val;       /* update the threshold values and generate a new lookup table if necessary */
+               
+                       if (val < 1) 
+                               voodoo->scrfilterEnabled = 0;
+                       voodoo_threshold_check(voodoo);         
+                       pclog("Voodoo Filter: %06x\n", val);
+               }
+               break;
+
+                case SST_fbiInit5:
+                /*Bits in mask 0x41e6 keep their current value*/
+                if (voodoo->initEnable & 0x01)
+                        voodoo->fbiInit5 = (val & ~0x41e6) | (voodoo->fbiInit5 & 0x41e6);
+                break;
+                case SST_fbiInit6:
+                if (voodoo->initEnable & 0x01)
+                        voodoo->fbiInit6 = val;
+                break;
+                case SST_fbiInit7:
+                if (voodoo->initEnable & 0x01)
+                {
+                        voodoo->fbiInit7 = val;
+                        voodoo->cmdfifo_enabled = val & 0x100;
+                }
+                break;
+
+                /*Base and end are given as 10-bit page numbers, converted to byte
+                  addresses by the shift of 12*/
+                case SST_cmdFifoBaseAddr:
+                voodoo->cmdfifo_base = (val & 0x3ff) << 12;
+                voodoo->cmdfifo_end = ((val >> 16) & 0x3ff) << 12;
+//                pclog("CMDFIFO base=%08x end=%08x\n", voodoo->cmdfifo_base, voodoo->cmdfifo_end);
+                break;
+
+                case SST_cmdFifoRdPtr:
+                voodoo->cmdfifo_rp = val;
+                break;
+                case SST_cmdFifoAMin:
+                voodoo->cmdfifo_amin = val;
+                break;
+                case SST_cmdFifoAMax:
+                voodoo->cmdfifo_amax = val;
+                break;
+                case SST_cmdFifoDepth:
+                /*Depth is kept as write count minus read count, so reset the read
+                  count and store the new depth in the write count*/
+                voodoo->cmdfifo_depth_rd = 0;
+                voodoo->cmdfifo_depth_wr = val & 0xffff;
+                break;
+
+                /*Any other register : queued for the FIFO thread, or only logged
+                  while the CMDFIFO is enabled*/
+                default:
+                if (voodoo->fbiInit7 & FBIINIT7_CMDFIFO_ENABLE)
+                {
+                        pclog("Unknown register write in CMDFIFO mode %08x %08x\n", addr, val);
+                }
+                else
+                {
+                        voodoo_queue_command(voodoo, addr | FIFO_WRITEL_REG, val);
+                }
+                break;
+        }
+}
+
+/*16-bit read through the snoop mapping : p is the voodoo_set_t, and the read
+  is answered by the first card of the set*/
+static uint16_t voodoo_snoop_readw(uint32_t addr, void *p)
+{
+        voodoo_t *first = ((voodoo_set_t *)p)->voodoos[0];
+
+        return voodoo_readw(addr, first);
+}
+/*32-bit read through the snoop mapping : p is the voodoo_set_t, and the read
+  is answered by the first card of the set*/
+static uint32_t voodoo_snoop_readl(uint32_t addr, void *p)
+{
+        voodoo_t *first = ((voodoo_set_t *)p)->voodoos[0];
+
+        return voodoo_readl(addr, first);
+}
+
+/*16-bit write through the snoop mapping : p is the voodoo_set_t, and the
+  write is delivered to the first card and then to the second*/
+static void voodoo_snoop_writew(uint32_t addr, uint16_t val, void *p)
+{
+        voodoo_set_t *cards = (voodoo_set_t *)p;
+        int i;
+
+        for (i = 0; i < 2; i++)
+                voodoo_writew(addr, val, cards->voodoos[i]);
+}
+/*32-bit write through the snoop mapping : p is the voodoo_set_t, and the
+  write is delivered to the first card and then to the second*/
+static void voodoo_snoop_writel(uint32_t addr, uint32_t val, void *p)
+{
+        voodoo_set_t *cards = (voodoo_set_t *)p;
+        int i;
+
+        for (i = 0; i < 2; i++)
+                voodoo_writel(addr, val, cards->voodoos[i]);
+}
+
+/*Re-establish the memory mappings of a card set from each card's pci_enable,
+  memBaseAddr and initEnable.
+  A card is mapped (16 MB at memBaseAddr) only when it is PCI-enabled and has
+  a non-zero base address. In a two-card set the shared snoop mapping replaces
+  the primary's own mapping when the primary is a Voodoo 2 and the secondary
+  has initEnable bit 23 set, and replaces both mappings when both cards are
+  enabled at the same address.*/
+static void voodoo_recalcmapping(voodoo_set_t *set)
+{
+        voodoo_t *pri = set->voodoos[0];
+        voodoo_t *sec;
+
+        if (set->nr_cards != 2)
+        {
+                /*Single card*/
+                if (pri->pci_enable && pri->memBaseAddr)
+                {
+                        pclog("voodoo_recalcmapping : memBaseAddr %08X\n", pri->memBaseAddr);
+                        mem_mapping_set_addr(&pri->mapping, pri->memBaseAddr, 0x01000000);
+                }
+                else
+                {
+                        pclog("voodoo_recalcmapping : disabled\n");
+                        mem_mapping_disable(&pri->mapping);
+                }
+                return;
+        }
+
+        sec = set->voodoos[1];
+
+        /*Primary card*/
+        if (!(pri->pci_enable && pri->memBaseAddr))
+        {
+                pclog("voodoo_recalcmapping (pri) : disabled\n");
+                mem_mapping_disable(&pri->mapping);
+        }
+        else if (pri->type == VOODOO_2 && (sec->initEnable & (1 << 23)))
+        {
+                /*Snooping : the set-wide mapping takes over the primary's address*/
+                pclog("voodoo_recalcmapping (pri) with snoop : memBaseAddr %08X\n", pri->memBaseAddr);
+                mem_mapping_disable(&pri->mapping);
+                mem_mapping_set_addr(&set->snoop_mapping, pri->memBaseAddr, 0x01000000);
+        }
+        else if (sec->pci_enable && (pri->memBaseAddr == sec->memBaseAddr))
+        {
+                /*Both cards at the same address : only the snoop mapping stays
+                  active, and the secondary is not processed any further*/
+                pclog("voodoo_recalcmapping (pri) (sec) same addr : memBaseAddr %08X\n", pri->memBaseAddr);
+                mem_mapping_disable(&pri->mapping);
+                mem_mapping_disable(&sec->mapping);
+                mem_mapping_set_addr(&set->snoop_mapping, pri->memBaseAddr, 0x01000000);
+                return;
+        }
+        else
+        {
+                pclog("voodoo_recalcmapping (pri) : memBaseAddr %08X\n", pri->memBaseAddr);
+                mem_mapping_disable(&set->snoop_mapping);
+                mem_mapping_set_addr(&pri->mapping, pri->memBaseAddr, 0x01000000);
+        }
+
+        /*Secondary card*/
+        if (sec->pci_enable && sec->memBaseAddr)
+        {
+                pclog("voodoo_recalcmapping (sec) : memBaseAddr %08X\n", sec->memBaseAddr);
+                mem_mapping_set_addr(&sec->mapping, sec->memBaseAddr, 0x01000000);
+        }
+        else
+        {
+                pclog("voodoo_recalcmapping (sec) : disabled\n");
+                mem_mapping_disable(&sec->mapping);
+        }
+}
+
+/*
+ * PCI configuration space read handler.
+ *
+ * func - PCI function number; anything other than 0 reads as 0.
+ * addr - byte offset into configuration space.
+ * p    - the voodoo_t registered with pci_add().
+ *
+ * Returns the byte at that offset; offsets not listed below read as 0.
+ */
+uint8_t voodoo_pci_read(int func, int addr, void *p)
+{
+        const voodoo_t *card = (const voodoo_t *)p;
+        uint8_t ret = 0;
+
+        if (func != 0)
+                return 0;
+
+        switch (addr)
+        {
+                /*0x121a : 3dfx*/
+                case 0x00: ret = 0x1a; break;
+                case 0x01: ret = 0x12; break;
+
+                /*0x0002 : Voodoo 2, 0x0001 : SST-1 (Voodoo Graphics)*/
+                case 0x02: ret = (card->type == VOODOO_2) ? 0x02 : 0x01; break;
+                case 0x03: ret = 0x00; break;
+
+                /*Bit 1 set while the card responds to memory accesses*/
+                case 0x04: ret = card->pci_enable ? 0x02 : 0x00; break;
+
+                case 0x08: ret = 2;    break; /*Revision ID*/
+                case 0x09: ret = 0;    break; /*Programming interface*/
+                case 0x0a: ret = 0;    break;
+                case 0x0b: ret = 0x04; break;
+
+                /*memBaseAddr : only the top byte is readable, the rest is always 0*/
+                case 0x10:
+                case 0x11:
+                case 0x12:
+                ret = 0x00;
+                break;
+                case 0x13:
+                ret = card->memBaseAddr >> 24;
+                break;
+
+                /*initEnable, one byte per offset*/
+                case 0x40:
+                case 0x42:
+                case 0x43:
+                ret = (card->initEnable >> ((addr - 0x40) * 8)) & 0xff;
+                break;
+
+                /*Second initEnable byte : only the low nibble is stored, Voodoo 2 reports 5 in the high nibble*/
+                case 0x41:
+                ret = (card->initEnable >> 8) & 0x0f;
+                if (card->type == VOODOO_2)
+                        ret |= 0x50;
+                break;
+        }
+
+        return ret;
+}
+
+/*
+ * PCI configuration space write handler.
+ *
+ * func - PCI function number; writes to anything other than function 0 are ignored.
+ * addr - byte offset into configuration space.
+ * val  - byte being written.
+ * p    - the voodoo_t registered with pci_add().
+ *
+ * Writes to the command byte (0x04), the top byte of memBaseAddr (0x13) and the
+ * upper two bytes of initEnable (0x42/0x43) re-evaluate the memory mappings of
+ * the whole card set. All other offsets not listed are ignored.
+ */
+void voodoo_pci_write(int func, int addr, uint8_t val, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        /*Widen before shifting : a uint8_t promotes to int, and shifting a value
+          >= 0x80 left by 24 would move a 1 into the sign bit (undefined behaviour)*/
+        uint32_t val32 = val;
+        
+        if (func)
+                return;
+
+        switch (addr)
+        {
+                case 0x04:
+                voodoo->pci_enable = val & 2; /*Respond to memory accesses*/
+                voodoo_recalcmapping(voodoo->set);
+                break;
+                
+                case 0x13:
+                voodoo->memBaseAddr = val32 << 24;
+                voodoo_recalcmapping(voodoo->set);
+                break;
+                
+                case 0x40:
+                voodoo->initEnable = (voodoo->initEnable & ~0x000000ff) | val32;
+                break;
+                case 0x41:
+                voodoo->initEnable = (voodoo->initEnable & ~0x0000ff00) | (val32 << 8);
+                break;
+                case 0x42:
+                voodoo->initEnable = (voodoo->initEnable & ~0x00ff0000) | (val32 << 16);
+                voodoo_recalcmapping(voodoo->set);
+                break;
+                case 0x43:
+                voodoo->initEnable = (voodoo->initEnable & ~0xff000000) | (val32 << 24);
+                voodoo_recalcmapping(voodoo->set);
+                break;
+
+                default:
+                break;
+        }
+}
+
+
+/*Append src to the NUL-terminated string already held in dst, never letting dst
+  grow beyond dst_size bytes in total (terminator included)*/
+static void voodoo_status_append(char *dst, size_t dst_size, const char *src)
+{
+        size_t used = strlen(dst);
+
+        if (used + 1 < dst_size)
+                strncat(dst, src, dst_size - used - 1);
+}
+
+/*
+ * Append performance statistics for the card set to the status string and reset
+ * the per-interval counters.
+ *
+ * s       - NUL-terminated status text to append to.
+ * max_len - treated here as the total capacity of s in bytes.
+ *           NOTE(review): confirm against the caller of add_status_info.
+ * p       - the voodoo_set_t returned by voodoo_init().
+ *
+ * With two cards the pixel/texel counts of both are summed and the render times
+ * averaged before the throughput figures are computed.
+ */
+static void voodoo_add_status_info(char *s, int max_len, void *p)
+{
+        voodoo_set_t *voodoo_set = (voodoo_set_t *)p;
+        voodoo_t *voodoo = voodoo_set->voodoos[0];
+        voodoo_t *voodoo_slave = voodoo_set->voodoos[1];
+        char temps[512], temps2[256];
+        int pixel_count_current[4];
+        int pixel_count_total;
+        int texel_count_current[4];
+        int texel_count_total;
+        int render_time[4];
+        uint64_t new_time = timer_read();
+        uint64_t status_diff = new_time - status_time;
+        status_time = new_time;
+        int c;
+
+        /*Avoid dividing by zero when called twice within one timer tick*/
+        if (!status_diff)
+                status_diff = 1;
+
+        for (c = 0; c < 4; c++)
+        {
+                pixel_count_current[c] = voodoo->pixel_count[c];
+                texel_count_current[c] = voodoo->texel_count[c];
+                render_time[c] = voodoo->render_time[c];
+        }
+        if (voodoo_set->nr_cards == 2)
+        {
+                for (c = 0; c < 4; c++)
+                {
+                        pixel_count_current[c] += voodoo_slave->pixel_count[c];
+                        texel_count_current[c] += voodoo_slave->texel_count[c];
+                        render_time[c] = (render_time[c] + voodoo_slave->render_time[c]) / 2;
+                }
+        }
+        /*Counts since the previous call : current totals minus the totals stored last time*/
+        pixel_count_total = (pixel_count_current[0] + pixel_count_current[1] + pixel_count_current[2] + pixel_count_current[3]) -
+                (voodoo->pixel_count_old[0] + voodoo->pixel_count_old[1] + voodoo->pixel_count_old[2] + voodoo->pixel_count_old[3]);
+        texel_count_total = (texel_count_current[0] + texel_count_current[1] + texel_count_current[2] + texel_count_current[3]) -
+                (voodoo->texel_count_old[0] + voodoo->texel_count_old[1] + voodoo->texel_count_old[2] + voodoo->texel_count_old[3]);
+        /*snprintf rather than sprintf : "%f" of a large value can be hundreds of characters long*/
+        snprintf(temps, sizeof(temps), "%f Mpixels/sec (%f)\n%f Mtexels/sec (%f)\n%f ktris/sec\n%f%% CPU (%f%% real)\n%d frames/sec (%i)\n%f%% CPU (%f%% real)\n",
+                (double)pixel_count_total/1000000.0,
+                ((double)pixel_count_total/1000000.0) / ((double)render_time[0] / status_diff),
+                (double)texel_count_total/1000000.0,
+                ((double)texel_count_total/1000000.0) / ((double)render_time[0] / status_diff),
+                (double)voodoo->tri_count/1000.0, ((double)voodoo->time * 100.0) / timer_freq, ((double)voodoo->time * 100.0) / status_diff, voodoo->frame_count, voodoo_recomp,
+                ((double)voodoo->render_time[0] * 100.0) / timer_freq, ((double)voodoo->render_time[0] * 100.0) / status_diff);
+        if (voodoo->render_threads >= 2)
+        {
+                snprintf(temps2, sizeof(temps2), "%f%% CPU (%f%% real)\n",
+                        ((double)voodoo->render_time[1] * 100.0) / timer_freq, ((double)voodoo->render_time[1] * 100.0) / status_diff);
+                voodoo_status_append(temps, sizeof(temps), temps2);
+        }
+        if (voodoo->render_threads == 4)
+        {
+                snprintf(temps2, sizeof(temps2), "%f%% CPU (%f%% real)\n%f%% CPU (%f%% real)\n",
+                        ((double)voodoo->render_time[2] * 100.0) / timer_freq, ((double)voodoo->render_time[2] * 100.0) / status_diff,
+                        ((double)voodoo->render_time[3] * 100.0) / timer_freq, ((double)voodoo->render_time[3] * 100.0) / status_diff);
+                voodoo_status_append(temps, sizeof(temps), temps2);
+        }
+        if (voodoo_set->nr_cards == 2)
+        {
+                snprintf(temps2, sizeof(temps2), "%f%% CPU (%f%% real)\n",
+                        ((double)voodoo_slave->render_time[0] * 100.0) / timer_freq, ((double)voodoo_slave->render_time[0] * 100.0) / status_diff);
+                voodoo_status_append(temps, sizeof(temps), temps2);
+                        
+                if (voodoo_slave->render_threads >= 2)
+                {
+                        snprintf(temps2, sizeof(temps2), "%f%% CPU (%f%% real)\n",
+                                ((double)voodoo_slave->render_time[1] * 100.0) / timer_freq, ((double)voodoo_slave->render_time[1] * 100.0) / status_diff);
+                        voodoo_status_append(temps, sizeof(temps), temps2);
+                }
+                if (voodoo_slave->render_threads == 4)
+                {
+                        snprintf(temps2, sizeof(temps2), "%f%% CPU (%f%% real)\n%f%% CPU (%f%% real)\n",
+                                ((double)voodoo_slave->render_time[2] * 100.0) / timer_freq, ((double)voodoo_slave->render_time[2] * 100.0) / status_diff,
+                                ((double)voodoo_slave->render_time[3] * 100.0) / timer_freq, ((double)voodoo_slave->render_time[3] * 100.0) / status_diff);
+                        voodoo_status_append(temps, sizeof(temps), temps2);
+                }
+        }
+        /*Bound the append by the space left in s, not by its total size*/
+        if (max_len > 0)
+                voodoo_status_append(s, (size_t)max_len, temps);
+
+        /*Remember the totals and clear the per-interval counters for the next call*/
+        for (c = 0; c < 4; c++)
+        {
+                voodoo->pixel_count_old[c] = pixel_count_current[c];
+                voodoo->texel_count_old[c] = texel_count_current[c];
+                voodoo->render_time[c] = 0;
+        }
+        voodoo->tri_count = voodoo->frame_count = 0;
+        voodoo->rd_count = voodoo->wr_count = voodoo->tex_count = 0;
+        voodoo->time = 0;
+        if (voodoo_set->nr_cards == 2)
+        {
+                for (c = 0; c < 4; c++)
+                {
+                        voodoo_slave->pixel_count_old[c] = pixel_count_current[c];
+                        voodoo_slave->texel_count_old[c] = texel_count_current[c];
+                        voodoo_slave->render_time[c] = 0;
+                }
+                voodoo_slave->tri_count = voodoo_slave->frame_count = 0;
+                voodoo_slave->rd_count = voodoo_slave->wr_count = voodoo_slave->tex_count = 0;
+                voodoo_slave->time = 0;
+        }
+        voodoo_recomp = 0;
+}
+
+/*Refresh the pixel clock and the PCI read/write/burst timings of one card
+  from the current pci_nonburst_time / pci_burst_time values*/
+static void voodoo_card_update_timing(voodoo_t *card)
+{
+        int read_mul, write_mul, burst_mul;
+
+        voodoo_pixelclock_update(card);
+
+        read_mul  = (card->fbiInit4 & 1) ? 2 : 1;
+        write_mul = (card->fbiInit1 & 2) ? 1 : 0;
+        burst_mul = (card->fbiInit1 & 2) ? 2 : 1;
+
+        card->read_time = pci_nonburst_time + pci_burst_time * read_mul;
+        card->write_time = pci_nonburst_time + pci_burst_time * write_mul;
+        card->burst_time = pci_burst_time * burst_mul;
+}
+
+/*
+ * Device "speed changed" callback : recompute the timings of the first card
+ * and, when two cards are present, of the second card as well.
+ *
+ * p - the voodoo_set_t returned by voodoo_init().
+ */
+static void voodoo_speed_changed(void *p)
+{
+        voodoo_set_t *set = (voodoo_set_t *)p;
+
+        voodoo_card_update_timing(set->voodoos[0]);
+        if (set->nr_cards == 2)
+                voodoo_card_update_timing(set->voodoos[1]);
+}
+
+/*
+ * Allocate and initialise one add-on Voodoo card (VOODOO_1, VOODOO_SB50 or
+ * VOODOO_2) from the current device configuration.
+ *
+ * Reads the configuration, registers the PCI handlers and the (initially empty)
+ * memory mapping, allocates framebuffer, texture memory and texture cache
+ * entries, creates the events, FIFO thread and render threads, and fills the
+ * shared colour expansion tables.
+ *
+ * Returns the new voodoo_t (as void *). The caller sets voodoo->set afterwards.
+ *
+ * NOTE(review): none of the malloc() results are checked before use.
+ */
+void *voodoo_card_init()
+{
+        int c;
+        voodoo_t *voodoo = malloc(sizeof(voodoo_t));
+        memset(voodoo, 0, sizeof(voodoo_t));
+
+        voodoo->bilinear_enabled = device_get_config_int("bilinear");
+        voodoo->scrfilter = device_get_config_int("dacfilter");
+        /*Memory sizes are configured in MB; the masks cover the size in bytes*/
+        voodoo->texture_size = device_get_config_int("texture_memory");
+        voodoo->texture_mask = (voodoo->texture_size << 20) - 1;
+        voodoo->fb_size = device_get_config_int("framebuffer_memory");
+        voodoo->fb_mask = (voodoo->fb_size << 20) - 1;
+        voodoo->render_threads = device_get_config_int("render_threads");
+        voodoo->odd_even_mask = voodoo->render_threads - 1;
+#ifndef NO_CODEGEN
+        voodoo->use_recompiler = device_get_config_int("recompiler");
+#endif                        
+        voodoo->type = device_get_config_int("type");
+        /*Only the original Voodoo Graphics has a single TMU*/
+        switch (voodoo->type)
+        {
+                case VOODOO_1:
+                voodoo->dual_tmus = 0;
+                break;
+                case VOODOO_SB50:
+                voodoo->dual_tmus = 1;
+                break;
+                case VOODOO_2:
+                voodoo->dual_tmus = 1;
+                break;
+        }
+        
+       if (voodoo->type == VOODOO_2) /*generate filter lookup tables*/
+               voodoo_generate_filter_v2(voodoo);
+       else
+               voodoo_generate_filter_v1(voodoo);
+        
+        pci_add(voodoo_pci_read, voodoo_pci_write, voodoo);
+
+        /*Added with base 0 and size 0; voodoo_recalcmapping() places it once the guest programs memBaseAddr*/
+        mem_mapping_add(&voodoo->mapping, 0, 0, NULL, voodoo_readw, voodoo_readl, NULL, voodoo_writew, voodoo_writel,     NULL, MEM_MAPPING_EXTERNAL, voodoo);
+
+        /*NOTE(review): always 4 MB regardless of the configured fb_size*/
+        voodoo->fb_mem = malloc(4 * 1024 * 1024);
+        voodoo->tex_mem[0] = malloc(voodoo->texture_size * 1024 * 1024);
+        if (voodoo->dual_tmus)
+                voodoo->tex_mem[1] = malloc(voodoo->texture_size * 1024 * 1024);
+        /*16-bit views of the texture memory; tex_mem[1] is still NULL from the memset on single TMU cards*/
+        voodoo->tex_mem_w[0] = (uint16_t *)voodoo->tex_mem[0];
+        voodoo->tex_mem_w[1] = (uint16_t *)voodoo->tex_mem[1];
+        
+        /*Each cache entry holds 4 bytes per texel for the level sizes summed below*/
+        for (c = 0; c < TEX_CACHE_MAX; c++)
+        {
+                voodoo->texture_cache[0][c].data = malloc((256*256 + 256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2) * 4);
+                voodoo->texture_cache[0][c].base = -1; /*invalid*/
+                voodoo->texture_cache[0][c].refcount = 0;
+                if (voodoo->dual_tmus)
+                {
+                        voodoo->texture_cache[1][c].data = malloc((256*256 + 256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2) * 4);
+                        voodoo->texture_cache[1][c].base = -1; /*invalid*/
+                        voodoo->texture_cache[1][c].refcount = 0;
+                }
+        }
+
+        timer_add(&voodoo->timer, voodoo_callback, voodoo, 1);
+        
+        voodoo->svga = svga_get_pri();
+        voodoo->fbiInit0 = 0;
+
+        /*All four per-thread events are created even when fewer render threads are configured.
+          The events are created before the threads that use them are started*/
+        voodoo->wake_fifo_thread = thread_create_event();
+        voodoo->wake_render_thread[0] = thread_create_event();
+        voodoo->wake_render_thread[1] = thread_create_event();
+        voodoo->wake_render_thread[2] = thread_create_event();
+        voodoo->wake_render_thread[3] = thread_create_event();
+        voodoo->wake_main_thread = thread_create_event();
+        voodoo->fifo_not_full_event = thread_create_event();
+        voodoo->render_not_full_event[0] = thread_create_event();
+        voodoo->render_not_full_event[1] = thread_create_event();
+        voodoo->render_not_full_event[2] = thread_create_event();
+        voodoo->render_not_full_event[3] = thread_create_event();
+        voodoo->fifo_thread = thread_create(voodoo_fifo_thread, voodoo);
+        voodoo->render_thread[0] = thread_create(voodoo_render_thread_1, voodoo);
+        if (voodoo->render_threads >= 2)
+                voodoo->render_thread[1] = thread_create(voodoo_render_thread_2, voodoo);
+        if (voodoo->render_threads == 4)
+        {
+                voodoo->render_thread[2] = thread_create(voodoo_render_thread_3, voodoo);
+                voodoo->render_thread[3] = thread_create(voodoo_render_thread_4, voodoo);
+        }
+        voodoo->swap_mutex = thread_create_mutex();
+        timer_add(&voodoo->wake_timer, voodoo_wake_timer, (void *)voodoo, 0);
+        
+        /*8-bit texel formats : expand every possible value to 8 bits per channel by replicating the top bits downwards*/
+        for (c = 0; c < 0x100; c++)
+        {
+                rgb332[c].r = c & 0xe0;
+                rgb332[c].g = (c << 3) & 0xe0;
+                rgb332[c].b = (c << 6) & 0xc0;
+                rgb332[c].r = rgb332[c].r | (rgb332[c].r >> 3) | (rgb332[c].r >> 6);
+                rgb332[c].g = rgb332[c].g | (rgb332[c].g >> 3) | (rgb332[c].g >> 6);
+                rgb332[c].b = rgb332[c].b | (rgb332[c].b >> 2);
+                rgb332[c].b = rgb332[c].b | (rgb332[c].b >> 4);
+                rgb332[c].a = 0xff;
+                
+                ai44[c].a = (c & 0xf0) | ((c & 0xf0) >> 4);
+                ai44[c].r = (c & 0x0f) | ((c & 0x0f) << 4);
+                ai44[c].g = ai44[c].b = ai44[c].r;
+        }
+                
+        /*16-bit texel formats, same bit replication scheme*/
+        for (c = 0; c < 0x10000; c++)
+        {
+                rgb565[c].r = (c >> 8) & 0xf8;
+                rgb565[c].g = (c >> 3) & 0xfc;
+                rgb565[c].b = (c << 3) & 0xf8;
+                rgb565[c].r |= (rgb565[c].r >> 5);
+                rgb565[c].g |= (rgb565[c].g >> 6);
+                rgb565[c].b |= (rgb565[c].b >> 5);
+                rgb565[c].a = 0xff;
+
+                argb1555[c].r = (c >> 7) & 0xf8;
+                argb1555[c].g = (c >> 2) & 0xf8;
+                argb1555[c].b = (c << 3) & 0xf8;
+                argb1555[c].r |= (argb1555[c].r >> 5);
+                argb1555[c].g |= (argb1555[c].g >> 5);
+                argb1555[c].b |= (argb1555[c].b >> 5);
+                argb1555[c].a = (c & 0x8000) ? 0xff : 0;
+
+                argb4444[c].a = (c >> 8) & 0xf0;
+                argb4444[c].r = (c >> 4) & 0xf0;
+                argb4444[c].g = c & 0xf0;
+                argb4444[c].b = (c << 4) & 0xf0;
+                argb4444[c].a |= (argb4444[c].a >> 4);
+                argb4444[c].r |= (argb4444[c].r >> 4);
+                argb4444[c].g |= (argb4444[c].g >> 4);
+                argb4444[c].b |= (argb4444[c].b >> 4);
+                
+                ai88[c].a = (c >> 8);
+                ai88[c].r = c & 0xff;
+                ai88[c].g = c & 0xff;
+                ai88[c].b = c & 0xff;
+        }
+#ifndef NO_CODEGEN
+        voodoo_codegen_init(voodoo);
+#endif
+
+        voodoo->disp_buffer = 0;
+        voodoo->draw_buffer = 1;
+        
+        return voodoo;
+}
+
+/*
+ * Allocate and initialise the 3D core state for a combined 2D/3D card.
+ *
+ * type - card type stored in voodoo->type; VOODOO_3 selects two TMUs, any other
+ *        value one.
+ *
+ * Unlike voodoo_card_init() this does not register PCI handlers or a memory
+ * mapping and does not allocate fb_mem / tex_mem; presumably the caller provides
+ * those (verify against the caller). It does allocate the texture cache, create
+ * the events and threads, and fill the shared colour expansion tables.
+ *
+ * Returns the new voodoo_t (as void *).
+ *
+ * NOTE(review): none of the malloc() results are checked before use.
+ */
+void *voodoo_2d3d_card_init(int type)
+{
+        int c;
+        voodoo_t *voodoo = malloc(sizeof(voodoo_t));
+        memset(voodoo, 0, sizeof(voodoo_t));
+
+        voodoo->bilinear_enabled = device_get_config_int("bilinear");
+        voodoo->scrfilter = device_get_config_int("dacfilter");
+        voodoo->render_threads = device_get_config_int("render_threads");
+        voodoo->odd_even_mask = voodoo->render_threads - 1;
+#ifndef NO_CODEGEN
+        voodoo->use_recompiler = device_get_config_int("recompiler");
+#endif
+        voodoo->type = type;
+        voodoo->dual_tmus = (type == VOODOO_3) ? 1 : 0;
+
+       /*generate filter lookup tables*/
+       voodoo_generate_filter_v2(voodoo);
+
+        /*Each cache entry holds 4 bytes per texel for the level sizes summed below*/
+        for (c = 0; c < TEX_CACHE_MAX; c++)
+        {
+                voodoo->texture_cache[0][c].data = malloc((256*256 + 256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2) * 4);
+                voodoo->texture_cache[0][c].base = -1; /*invalid*/
+                voodoo->texture_cache[0][c].refcount = 0;
+                if (voodoo->dual_tmus)
+                {
+                        voodoo->texture_cache[1][c].data = malloc((256*256 + 256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2) * 4);
+                        voodoo->texture_cache[1][c].base = -1; /*invalid*/
+                        voodoo->texture_cache[1][c].refcount = 0;
+                }
+        }
+
+        timer_add(&voodoo->timer, voodoo_callback, voodoo, 1);
+
+        voodoo->fbiInit0 = 0;
+
+        /*All four per-thread events are created even when fewer render threads are configured.
+          The events are created before the threads that use them are started*/
+        voodoo->wake_fifo_thread = thread_create_event();
+        voodoo->wake_render_thread[0] = thread_create_event();
+        voodoo->wake_render_thread[1] = thread_create_event();
+        voodoo->wake_render_thread[2] = thread_create_event();
+        voodoo->wake_render_thread[3] = thread_create_event();
+        voodoo->wake_main_thread = thread_create_event();
+        voodoo->fifo_not_full_event = thread_create_event();
+        voodoo->render_not_full_event[0] = thread_create_event();
+        voodoo->render_not_full_event[1] = thread_create_event();
+        voodoo->render_not_full_event[2] = thread_create_event();
+        voodoo->render_not_full_event[3] = thread_create_event();
+        voodoo->fifo_thread = thread_create(voodoo_fifo_thread, voodoo);
+        voodoo->render_thread[0] = thread_create(voodoo_render_thread_1, voodoo);
+        if (voodoo->render_threads >= 2)
+                voodoo->render_thread[1] = thread_create(voodoo_render_thread_2, voodoo);
+        if (voodoo->render_threads == 4)
+        {
+                voodoo->render_thread[2] = thread_create(voodoo_render_thread_3, voodoo);
+                voodoo->render_thread[3] = thread_create(voodoo_render_thread_4, voodoo);
+        }
+        voodoo->swap_mutex = thread_create_mutex();
+        timer_add(&voodoo->wake_timer, voodoo_wake_timer, (void *)voodoo, 0);
+
+        /*8-bit texel formats : expand every possible value to 8 bits per channel by replicating the top bits downwards*/
+        for (c = 0; c < 0x100; c++)
+        {
+                rgb332[c].r = c & 0xe0;
+                rgb332[c].g = (c << 3) & 0xe0;
+                rgb332[c].b = (c << 6) & 0xc0;
+                rgb332[c].r = rgb332[c].r | (rgb332[c].r >> 3) | (rgb332[c].r >> 6);
+                rgb332[c].g = rgb332[c].g | (rgb332[c].g >> 3) | (rgb332[c].g >> 6);
+                rgb332[c].b = rgb332[c].b | (rgb332[c].b >> 2);
+                rgb332[c].b = rgb332[c].b | (rgb332[c].b >> 4);
+                rgb332[c].a = 0xff;
+
+                ai44[c].a = (c & 0xf0) | ((c & 0xf0) >> 4);
+                ai44[c].r = (c & 0x0f) | ((c & 0x0f) << 4);
+                ai44[c].g = ai44[c].b = ai44[c].r;
+        }
+
+        /*16-bit texel formats, same bit replication scheme*/
+        for (c = 0; c < 0x10000; c++)
+        {
+                rgb565[c].r = (c >> 8) & 0xf8;
+                rgb565[c].g = (c >> 3) & 0xfc;
+                rgb565[c].b = (c << 3) & 0xf8;
+                rgb565[c].r |= (rgb565[c].r >> 5);
+                rgb565[c].g |= (rgb565[c].g >> 6);
+                rgb565[c].b |= (rgb565[c].b >> 5);
+                rgb565[c].a = 0xff;
+
+                argb1555[c].r = (c >> 7) & 0xf8;
+                argb1555[c].g = (c >> 2) & 0xf8;
+                argb1555[c].b = (c << 3) & 0xf8;
+                argb1555[c].r |= (argb1555[c].r >> 5);
+                argb1555[c].g |= (argb1555[c].g >> 5);
+                argb1555[c].b |= (argb1555[c].b >> 5);
+                argb1555[c].a = (c & 0x8000) ? 0xff : 0;
+
+                argb4444[c].a = (c >> 8) & 0xf0;
+                argb4444[c].r = (c >> 4) & 0xf0;
+                argb4444[c].g = c & 0xf0;
+                argb4444[c].b = (c << 4) & 0xf0;
+                argb4444[c].a |= (argb4444[c].a >> 4);
+                argb4444[c].r |= (argb4444[c].r >> 4);
+                argb4444[c].g |= (argb4444[c].g >> 4);
+                argb4444[c].b |= (argb4444[c].b >> 4);
+
+                ai88[c].a = (c >> 8);
+                ai88[c].r = c & 0xff;
+                ai88[c].g = c & 0xff;
+                ai88[c].b = c & 0xff;
+        }
+#ifndef NO_CODEGEN
+        voodoo_codegen_init(voodoo);
+#endif
+
+        voodoo->disp_buffer = 0;
+        voodoo->draw_buffer = 1;
+
+        return voodoo;
+}
+
+/*
+ * Device init callback : build the card set described by the configuration.
+ *
+ * Creates one card, or two when the "sli" option is set, links each card back
+ * to the set, flags both cards as multi-card when two are present, stores the
+ * tmuConfig value matching the card type in every card and registers the
+ * (initially empty) snoop mapping.
+ *
+ * Returns the new voodoo_set_t (as void *).
+ */
+void *voodoo_init()
+{
+        voodoo_set_t *set = malloc(sizeof(voodoo_set_t));
+        uint32_t tmu_cfg = 1;
+        int card_type;
+        int two_cards;
+
+        memset(set, 0, sizeof(voodoo_set_t));
+
+        card_type = device_get_config_int("type");
+
+        set->nr_cards = device_get_config_int("sli") ? 2 : 1;
+        two_cards = (set->nr_cards == 2);
+
+        set->voodoos[0] = voodoo_card_init();
+        set->voodoos[0]->set = set;
+
+        if (two_cards)
+        {
+                set->voodoos[1] = voodoo_card_init();
+                set->voodoos[1]->set = set;
+
+                /*Voodoo 2 flags the pairing in fbiInit5, the older cards in fbiInit1*/
+                if (card_type == VOODOO_2)
+                {
+                        set->voodoos[0]->fbiInit5 |= FBIINIT5_MULTI_CVG;
+                        set->voodoos[1]->fbiInit5 |= FBIINIT5_MULTI_CVG;
+                }
+                else
+                {
+                        set->voodoos[0]->fbiInit1 |= FBIINIT1_MULTI_SST;
+                        set->voodoos[1]->fbiInit1 |= FBIINIT1_MULTI_SST;
+                }
+        }
+
+        /*Unknown types keep the initial value of 1*/
+        if (card_type == VOODOO_1)
+                tmu_cfg = two_cards ? (1 | (3 << 3)) : 1;
+        else if (card_type == VOODOO_SB50)
+                tmu_cfg = two_cards ? (1 | (3 << 3) | (3 << 6) | (2 << 9)) : (1 | (3 << 6));
+        else if (card_type == VOODOO_2)
+                tmu_cfg = 1 | (3 << 6);
+
+        set->voodoos[0]->tmuConfig = tmu_cfg;
+        if (two_cards)
+                set->voodoos[1]->tmuConfig = tmu_cfg;
+
+        mem_mapping_add(&set->snoop_mapping, 0, 0, NULL, voodoo_snoop_readw, voodoo_snoop_readl, NULL, voodoo_snoop_writew, voodoo_snoop_writel,     NULL, MEM_MAPPING_EXTERNAL, set);
+
+        return set;
+}
+
+/*
+ * Tear down one card : stop its threads, destroy its events, free its texture
+ * cache and, for cards below VOODOO_BANSHEE, its framebuffer and texture
+ * memory, then free the voodoo_t itself.
+ *
+ * In non-release builds the texture memory is first dumped to texram.dmp
+ * (and texram2.dmp for the second TMU); a dump file that cannot be opened is
+ * skipped.
+ *
+ * NOTE(review): swap_mutex is created at init but not destroyed here.
+ */
+void voodoo_card_close(voodoo_t *voodoo)
+{
+#ifndef RELEASE_BUILD
+        FILE *f;
+#endif
+        int c;
+        
+#ifndef RELEASE_BUILD        
+        if (voodoo->tex_mem[0])
+        {
+                f = romfopen("texram.dmp", "wb");
+                if (f) /*The dump is best effort only*/
+                {
+                        fwrite(voodoo->tex_mem[0], voodoo->texture_size*1024*1024, 1, f);
+                        fclose(f);
+                }
+                if (voodoo->dual_tmus && voodoo->tex_mem[1])
+                {
+                        f = romfopen("texram2.dmp", "wb");
+                        if (f)
+                        {
+                                fwrite(voodoo->tex_mem[1], voodoo->texture_size*1024*1024, 1, f);
+                                fclose(f);
+                        }
+                }
+        }
+#endif
+
+        /*Only the threads that were actually started are killed*/
+        thread_kill(voodoo->fifo_thread);
+        thread_kill(voodoo->render_thread[0]);
+        if (voodoo->render_threads >= 2)
+                thread_kill(voodoo->render_thread[1]);
+        if (voodoo->render_threads == 4)
+        {
+                thread_kill(voodoo->render_thread[2]);
+                thread_kill(voodoo->render_thread[3]);
+        }
+        thread_destroy_event(voodoo->fifo_not_full_event);
+        thread_destroy_event(voodoo->wake_main_thread);
+        thread_destroy_event(voodoo->wake_fifo_thread);
+        /*Init always creates all four events of each kind, whatever the render thread count*/
+        for (c = 0; c < 4; c++)
+                thread_destroy_event(voodoo->wake_render_thread[c]);
+        for (c = 0; c < 4; c++)
+                thread_destroy_event(voodoo->render_not_full_event[c]);
+
+        for (c = 0; c < TEX_CACHE_MAX; c++)
+        {
+                if (voodoo->dual_tmus)
+                        free(voodoo->texture_cache[1][c].data);
+                free(voodoo->texture_cache[0][c].data);
+        }
+#ifndef NO_CODEGEN
+        voodoo_codegen_close(voodoo);
+#endif
+        if (voodoo->type < VOODOO_BANSHEE && voodoo->fb_mem)
+        {
+                free(voodoo->fb_mem);
+                if (voodoo->dual_tmus)
+                        free(voodoo->tex_mem[1]);
+                free(voodoo->tex_mem[0]);
+        }
+        free(voodoo);
+}
+
+/*
+ * Device close callback : close the second card first when two are present,
+ * then the first card, then free the set itself.
+ *
+ * p - the voodoo_set_t returned by voodoo_init().
+ */
+void voodoo_close(void *p)
+{
+        voodoo_set_t *set = (voodoo_set_t *)p;
+        int has_second_card = (set->nr_cards == 2);
+
+        if (has_second_card)
+        {
+                voodoo_card_close(set->voodoos[1]);
+        }
+        voodoo_card_close(set->voodoos[0]);
+
+        free(set);
+}
+
+/*
+ * Configuration options of the Voodoo device. Each .name is the key the init
+ * code passes to device_get_config_int().
+ *
+ * NOTE(review): the selection entries with an empty description and the final
+ * entry with .type = -1 look like list terminators - confirm against device.h.
+ */
+static device_config_t voodoo_config[] =
+{
+        /*Card model; selects the TMU count, filter tables and PCI device ID*/
+        {
+                .name = "type",
+                .description = "Voodoo type",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "Voodoo Graphics",
+                                .value = VOODOO_1
+                        },
+                        {
+                                .description = "Obsidian SB50 + Amethyst (2 TMUs)",
+                                .value = VOODOO_SB50
+                        },
+                        {
+                                .description = "Voodoo 2",
+                                .value = VOODOO_2
+                        },
+                        {
+                                .description = ""
+                        }
+                },
+                /*NOTE(review): presumably equal to VOODOO_1 - confirm against the enum*/
+                .default_int = 0
+        },
+        /*Framebuffer size in MB*/
+        {
+                .name = "framebuffer_memory",
+                .description = "Framebuffer memory size",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "2 MB",
+                                .value = 2
+                        },
+                        {
+                                .description = "4 MB",
+                                .value = 4
+                        },
+                        {
+                                .description = ""
+                        }
+                },
+                .default_int = 2
+        },
+        /*Texture memory size in MB, per TMU*/
+        {
+                .name = "texture_memory",
+                .description = "Texture memory size",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "2 MB",
+                                .value = 2
+                        },
+                        {
+                                .description = "4 MB",
+                                .value = 4
+                        },
+                        {
+                                .description = ""
+                        }
+                },
+                .default_int = 2
+        },
+        {
+                .name = "bilinear",
+                .description = "Bilinear filtering",
+                .type = CONFIG_BINARY,
+                .default_int = 1
+        },
+        {
+                .name = "dacfilter",
+                .description = "Screen Filter",
+                .type = CONFIG_BINARY,
+                .default_int = 0
+        },
+        /*Number of render threads started per card*/
+        {
+                .name = "render_threads",
+                .description = "Render threads",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "1",
+                                .value = 1
+                        },
+                        {
+                                .description = "2",
+                                .value = 2
+                        },
+                        {
+                                .description = "4",
+                                .value = 4
+                        },
+                        {
+                                .description = ""
+                        }
+                },
+                .default_int = 2
+        },
+        /*When set, voodoo_init() creates two cards instead of one*/
+        {
+                .name = "sli",
+                .description = "SLI",
+                .type = CONFIG_BINARY,
+                .default_int = 0
+        },
+#ifndef NO_CODEGEN
+        {
+                .name = "recompiler",
+                .description = "Recompiler",
+                .type = CONFIG_BINARY,
+                .default_int = 1
+        },
+#endif
+        {
+                .type = -1
+        }
+};
+
+/*
+ * Device descriptor for the add-on Voodoo cards.
+ *
+ * NOTE(review): the fields are positional; the per-field notes below are
+ * inferred from the handler names - confirm against device_t in device.h.
+ */
+device_t voodoo_device =
+{
+        "3DFX Voodoo Graphics", /*device name*/
+        DEVICE_PCI,
+        voodoo_init,            /*creates the voodoo_set_t*/
+        voodoo_close,           /*destroys the voodoo_set_t*/
+        NULL,
+        voodoo_speed_changed,   /*recomputes PCI timings*/
+        NULL,
+        voodoo_add_status_info, /*appends statistics to the status text*/
+        voodoo_config           /*configuration options*/
+};
diff --git a/pcem/vid_voodoo.h b/pcem/vid_voodoo.h
new file mode 100644 (file)
index 0000000..0479783
--- /dev/null
@@ -0,0 +1 @@
+extern device_t voodoo_device;
diff --git a/pcem/vid_voodoo_banshee.cpp b/pcem/vid_voodoo_banshee.cpp
new file mode 100644 (file)
index 0000000..2ac803e
--- /dev/null
@@ -0,0 +1,2881 @@
+#include <stdlib.h>
+#include "ibm.h"
+#include "device.h"
+#include "io.h"
+#include "mem.h"
+#include "pci.h"
+#include "rom.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_ddc.h"
+#include "vid_svga.h"
+#include "vid_svga_render.h"
+#include "vid_voodoo_banshee.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_display.h"
+#include "vid_voodoo_fifo.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "x86.h"
+
+/*Drop any CLAMP macro that one of the headers above may have defined*/
+#ifdef CLAMP
+#undef CLAMP
+#endif
+
+/*256x256 byte lookup tables.
+  NOTE(review): the names suggest screen filter tables (v1 and bx variants,
+  red/blue and green channels); they are filled and read outside this part of
+  the file - confirm there*/
+static uint8_t vb_filter_v1_rb[256][256];
+static uint8_t vb_filter_v1_g [256][256];
+
+static uint8_t vb_filter_bx_rb[256][256];
+static uint8_t vb_filter_bx_g [256][256];
+
+/*Card variants handled by this file.
+  NOTE(review): presumably the values stored in banshee_t.type - confirm at
+  the init code*/
+enum
+{
+        TYPE_BANSHEE = 0,
+        TYPE_V3_2000,
+        TYPE_V3_3000
+};
+
+/*Per-card state: the embedded SVGA core, shadow copies of the registers
+  written through banshee_ext_outl(), the memory mappings and a pointer to the
+  shared Voodoo state*/
+typedef struct banshee_t
+{
+        svga_t svga;
+        
+        rom_t bios_rom;
+        
+        uint8_t pci_regs[256]; /*Indexed with PCI_REG_COMMAND by banshee_updatemapping()*/
+        
+        uint32_t memBaseAddr0; /*Base address of the two register mappings*/
+        uint32_t memBaseAddr1; /*Base address of the linear framebuffer mapping*/
+        uint32_t ioBaseAddr;
+
+        /*Init register shadows*/
+        uint32_t agpInit0;
+        uint32_t dramInit0, dramInit1;
+        uint32_t lfbMemoryConfig;
+        uint32_t miscInit0, miscInit1;
+        uint32_t pciInit0;
+        uint32_t vgaInit0, vgaInit1;
+        
+        uint32_t command_2d;
+        uint32_t srcBaseAddr_2d;
+        
+        uint32_t pllCtrl0, pllCtrl1, pllCtrl2; /*pllCtrl0 drives the pixel clock in banshee_recalctimings()*/
+        
+        uint32_t dacMode;
+        int dacAddr; /*Index into svga.pallook used by DAC_dacData accesses*/
+
+        /*Video register shadows*/
+        uint32_t vidDesktopOverlayStride;
+        uint32_t vidDesktopStartAddr;
+        uint32_t vidProcCfg;
+        uint32_t vidScreenSize;
+        uint32_t vidSerialParallelPort;
+        
+        int overlay_pix_fmt; /*Overlay pixel format field extracted from vidProcCfg on write*/
+        
+        uint32_t hwCurPatAddr, hwCurLoc, hwCurC0, hwCurC1;
+
+        uint32_t intrCtrl;
+        
+        uint32_t overlay_buffer[2][4096];
+
+        mem_mapping_t linear_mapping;
+
+        mem_mapping_t reg_mapping_low;  /*0000000-07fffff*/
+        mem_mapping_t reg_mapping_high; /*0c00000-1ffffff - Windows 2000 puts the BIOS ROM in between these two areas*/
+        
+        voodoo_t *voodoo;
+        
+        /*State of the tiled desktop renderer: base address, current source
+          row and byte distance between vertically adjacent rows of tiles*/
+        uint32_t desktop_addr;
+        int desktop_y;
+        uint32_t desktop_stride_tiled;
+
+        int type;
+} banshee_t;
+
+/*Register offsets in the extended I/O block. The handlers below select on
+  (addr & 0xff), so these are offsets within a 256 byte window*/
+enum
+{
+        Init_status     = 0x00,
+        Init_pciInit0   = 0x04,
+        Init_lfbMemoryConfig = 0x0c,
+        Init_miscInit0  = 0x10,
+        Init_miscInit1  = 0x14,
+        Init_dramInit0  = 0x18,
+        Init_dramInit1  = 0x1c,
+        Init_agpInit0   = 0x20,
+        Init_vgaInit0   = 0x28,
+        Init_vgaInit1   = 0x2c,
+        Init_2dCommand     = 0x30,
+        Init_2dSrcBaseAddr = 0x34,
+        Init_strapInfo  = 0x38,
+        
+        PLL_pllCtrl0    = 0x40,
+        PLL_pllCtrl1    = 0x44,
+        PLL_pllCtrl2    = 0x48,
+        
+        DAC_dacMode     = 0x4c,
+        DAC_dacAddr     = 0x50,
+        DAC_dacData     = 0x54,
+        
+        /*Note: the next two entries are listed out of address order*/
+        Video_vidProcCfg = 0x5c,
+        Video_maxRgbDelta = 0x58,
+        Video_hwCurPatAddr = 0x60,
+        Video_hwCurLoc     = 0x64,
+        Video_hwCurC0      = 0x68,
+        Video_hwCurC1      = 0x6c,
+        Video_vidSerialParallelPort = 0x78,
+        Video_vidScreenSize = 0x98,
+        Video_vidOverlayStartCoords = 0x9c,
+        Video_vidOverlayEndScreenCoords = 0xa0,
+        Video_vidOverlayDudx = 0xa4,
+        Video_vidOverlayDudxOffsetSrcWidth = 0xa8,
+        Video_vidOverlayDvdy = 0xac,
+        Video_vidOverlayDvdyOffset = 0xe0,
+        Video_vidDesktopStartAddr = 0xe4,
+        Video_vidDesktopOverlayStride = 0xe8
+};
+
+/*Offsets of the command FIFO 0 registers. They are not referenced in the
+  handlers visible in this part of the file.
+  NOTE(review): offsets are presumably relative to a separate register block -
+  confirm at the point of use*/
+enum
+{
+        cmdBaseAddr0  = 0x20,
+        cmdBaseSize0  = 0x24,
+        cmdBump0      = 0x28,
+        cmdRdPtrL0    = 0x2c,
+        cmdRdPtrH0    = 0x30,
+        cmdAMin0      = 0x34,
+        cmdAMax0      = 0x3c,
+        cmdFifoDepth0 = 0x44,
+        cmdHoleCnt0   = 0x48
+};
+
+#define VGAINIT0_EXTENDED_SHIFT_OUT (1 << 12)
+
+#define VIDPROCCFG_CURSOR_MODE (1 << 1)
+#define VIDPROCCFG_HALF_MODE (1 << 4)
+#define VIDPROCCFG_OVERLAY_ENABLE (1 << 8)
+#define VIDPROCCFG_OVERLAY_CLUT_BYPASS (1 << 11)
+#define VIDPROCCFG_OVERLAY_CLUT_SEL (1 << 13)
+#define VIDPROCCFG_H_SCALE_ENABLE (1 << 14)
+#define VIDPROCCFG_V_SCALE_ENABLE (1 << 15)
+#define VIDPROCCFG_FILTER_MODE_MASK (3 << 16)
+#define VIDPROCCFG_FILTER_MODE_POINT      (0 << 16)
+#define VIDPROCCFG_FILTER_MODE_DITHER_2X2 (1 << 16)
+#define VIDPROCCFG_FILTER_MODE_DITHER_4X4 (2 << 16)
+#define VIDPROCCFG_FILTER_MODE_BILINEAR   (3 << 16)
+#define VIDPROCCFG_DESKTOP_PIX_FORMAT ((banshee->vidProcCfg >> 18) & 7)
+#define VIDPROCCFG_OVERLAY_PIX_FORMAT ((banshee->vidProcCfg >> 21) & 7)
+#define VIDPROCCFG_OVERLAY_PIX_FORMAT_SHIFT (21)
+#define VIDPROCCFG_OVERLAY_PIX_FORMAT_MASK (7 << VIDPROCCFG_OVERLAY_PIX_FORMAT_SHIFT)
+#define VIDPROCCFG_DESKTOP_TILE (1 << 24)
+#define VIDPROCCFG_OVERLAY_TILE (1 << 25)
+#define VIDPROCCFG_2X_MODE      (1 << 26)
+#define VIDPROCCFG_HWCURSOR_ENA (1 << 27)
+
+#define OVERLAY_FMT_565        (1)
+#define OVERLAY_FMT_YUYV422    (5)
+#define OVERLAY_FMT_UYVY422    (6)
+#define OVERLAY_FMT_565_DITHER (7)
+
+#define OVERLAY_START_X_MASK (0xfff)
+#define OVERLAY_START_Y_SHIFT (12)
+#define OVERLAY_START_Y_MASK (0xfff << OVERLAY_START_Y_SHIFT)
+
+#define OVERLAY_END_X_MASK (0xfff)
+#define OVERLAY_END_Y_SHIFT (12)
+#define OVERLAY_END_Y_MASK (0xfff << OVERLAY_END_Y_SHIFT)
+
+#define OVERLAY_SRC_WIDTH_SHIFT (19)
+#define OVERLAY_SRC_WIDTH_MASK  (0x1fff << OVERLAY_SRC_WIDTH_SHIFT)
+
+#define VID_STRIDE_OVERLAY_SHIFT (16)
+#define VID_STRIDE_OVERLAY_MASK (0x7fff << VID_STRIDE_OVERLAY_SHIFT)
+
+#define VID_DUDX_MASK (0xffffff)
+#define VID_DVDY_MASK (0xffffff)
+
+#define PIX_FORMAT_8      0
+#define PIX_FORMAT_RGB565 1
+#define PIX_FORMAT_RGB24  2
+#define PIX_FORMAT_RGB32  3
+
+#define VIDSERIAL_DDC_DCK_W (1 << 19)
+#define VIDSERIAL_DDC_DDA_W (1 << 20)
+#define VIDSERIAL_DDC_DCK_R (1 << 21)
+#define VIDSERIAL_DDC_DDA_R (1 << 22)
+#define VIDSERIAL_I2C_SCK_W (1 << 24)
+#define VIDSERIAL_I2C_SDA_W (1 << 25)
+#define VIDSERIAL_I2C_SCK_R (1 << 26)
+#define VIDSERIAL_I2C_SDA_R (1 << 27)
+
+static uint32_t banshee_status(banshee_t *banshee);
+
+static void banshee_out(uint16_t addr, uint8_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        svga_t *svga = &banshee->svga;
+        uint8_t old;
+        
+//        /*if (addr != 0x3c9) */pclog("banshee_out : %04X %02X  %04X:%04X\n", addr, val, CS,cpu_state.pc);
+                
+        if (((addr & 0xfff0) == 0x3d0 || (addr & 0xfff0) == 0x3b0) && !(svga->miscout & 1)) 
+                addr ^= 0x60;
+
+        switch (addr)
+        {
+                case 0x3D4:
+                svga->crtcreg = val & 0x3f;
+                return;
+                case 0x3D5:
+                if ((svga->crtcreg < 7) && (svga->crtc[0x11] & 0x80))
+                        return;
+                if ((svga->crtcreg == 7) && (svga->crtc[0x11] & 0x80))
+                        val = (svga->crtc[7] & ~0x10) | (val & 0x10);
+                old = svga->crtc[svga->crtcreg];
+                svga->crtc[svga->crtcreg] = val;
+                if (old != val)
+                {
+                        if (svga->crtcreg < 0xe || svga->crtcreg > 0x10)
+                        {
+                                svga->fullchange = changeframecount;
+                                svga_recalctimings(svga);
+                        }
+                }
+                break;
+        }
+        svga_out(addr, val, svga);
+}
+
+static uint8_t banshee_in(uint16_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        svga_t *svga = &banshee->svga;
+        uint8_t temp;
+
+//        if (addr != 0x3da) pclog("banshee_in : %04X ", addr);
+                
+        if (((addr & 0xfff0) == 0x3d0 || (addr & 0xfff0) == 0x3b0) && !(svga->miscout & 1)) 
+                addr ^= 0x60;
+             
+        switch (addr)
+        {
+                case 0x3c2:
+                if ((svga->vgapal[0].r + svga->vgapal[0].g + svga->vgapal[0].b) >= 0x40)
+                        temp = 0;
+                else
+                        temp = 0x10;
+                break;
+                case 0x3D4:
+                temp = svga->crtcreg;
+                break;
+                case 0x3D5:
+                temp = svga->crtc[svga->crtcreg];
+                break;
+                default:
+                temp = svga_in(addr, svga);
+                break;
+        }
+//        if (addr != 0x3da) pclog("%02X  %04X:%04X %i\n", temp, CS,cpu_state.pc, ins);
+        return temp;
+}
+
+static void banshee_updatemapping(banshee_t *banshee)
+{
+        svga_t *svga = &banshee->svga;
+
+        if (!(banshee->pci_regs[PCI_REG_COMMAND] & PCI_COMMAND_MEM))
+        {
+//                pclog("Update mapping - PCI disabled\n");
+                mem_mapping_disable(&svga->mapping);
+                mem_mapping_disable(&banshee->linear_mapping);
+                mem_mapping_disable(&banshee->reg_mapping_low);
+                mem_mapping_disable(&banshee->reg_mapping_high);
+                return;
+        }
+
+        pclog("Update mapping - bank %02X ", svga->gdcreg[6] & 0xc);        
+        switch (svga->gdcreg[6] & 0xc) /*Banked framebuffer*/
+        {
+                case 0x0: /*128k at A0000*/
+                mem_mapping_set_addr(&svga->mapping, 0xa0000, 0x20000);
+                svga->banked_mask = 0xffff;
+                break;
+                case 0x4: /*64k at A0000*/
+                mem_mapping_set_addr(&svga->mapping, 0xa0000, 0x10000);
+                svga->banked_mask = 0xffff;
+                break;
+                case 0x8: /*32k at B0000*/
+                mem_mapping_set_addr(&svga->mapping, 0xb0000, 0x08000);
+                svga->banked_mask = 0x7fff;
+                break;
+                case 0xC: /*32k at B8000*/
+                mem_mapping_set_addr(&svga->mapping, 0xb8000, 0x08000);
+                svga->banked_mask = 0x7fff;
+                break;
+        }
+        
+        pclog("Linear framebuffer %08X  ", banshee->memBaseAddr1);
+        mem_mapping_set_addr(&banshee->linear_mapping, banshee->memBaseAddr1, 32 << 20);
+        pclog("registers %08X\n", banshee->memBaseAddr0);
+        mem_mapping_set_addr(&banshee->reg_mapping_low, banshee->memBaseAddr0, 8 << 20);
+        mem_mapping_set_addr(&banshee->reg_mapping_high, banshee->memBaseAddr0 + 0xc00000, 20 << 20);
+}
+
+static void banshee_render_16bpp_tiled(svga_t *svga)
+{
+        banshee_t *banshee = (banshee_t *)svga->p;
+        int x;
+        int offset = 32;
+        uint32_t *p = &((uint32_t *)buffer32->line[svga->displine])[offset];
+        uint32_t addr;
+        int drawn = 0;
+
+        if (banshee->vidProcCfg & VIDPROCCFG_HALF_MODE)
+                addr = banshee->desktop_addr + ((banshee->desktop_y >> 1) & 31) * 128 + ((banshee->desktop_y >> 6) * banshee->desktop_stride_tiled);
+        else
+                addr = banshee->desktop_addr + (banshee->desktop_y & 31) * 128 + ((banshee->desktop_y >> 5) * banshee->desktop_stride_tiled);
+
+        for (x = 0; x <= svga->hdisp; x += 64)
+        {
+                if (svga->hwcursor_on || svga->overlay_on)
+                        svga->changedvram[addr >> 12] = 2;
+                if (svga->changedvram[addr >> 12] || svga->fullchange)
+                {
+                        uint16_t *vram_p = (uint16_t *)&svga->vram[addr & svga->vram_display_mask];
+                        int xx;
+
+                        for (xx = 0; xx < 64; xx++)
+                                *p++ = video_16to32[*vram_p++];
+
+                        drawn = 1;
+                }
+                else
+                        p += 64;
+                addr += 128*32;
+        }
+
+        if (drawn)
+        {
+                if (svga->firstline_draw == 2000)
+                        svga->firstline_draw = svga->displine;
+                svga->lastline_draw = svga->displine;
+        }
+
+        banshee->desktop_y++;
+}
+
+static void banshee_recalctimings(svga_t *svga)
+{
+        banshee_t *banshee = (banshee_t *)svga->p;
+        voodoo_t *voodoo = banshee->voodoo;
+        
+/*7 R/W Horizontal Retrace End bit 5. -
+  6 R/W Horizontal Retrace Start bit 8 0x4
+  5 R/W Horizontal Blank End bit 6. -
+  4 R/W Horizontal Blank Start bit 8. 0x3
+  3 R/W Reserved. -
+  2 R/W Horizontal Display Enable End bit 8. 0x1
+  1 R/W Reserved. -
+  0 R/W Horizontal Total bit 8. 0x0*/
+        if (svga->crtc[0x1a] & 0x01) svga->htotal      += 0x100;
+        if (svga->crtc[0x1a] & 0x04) svga->hdisp       += 0x100;
+/*6 R/W Vertical Retrace Start bit 10 0x10
+  5 R/W Reserved. -
+  4 R/W Vertical Blank Start bit 10. 0x15
+  3 R/W Reserved. -
+  2 R/W Vertical Display Enable End bit 10 0x12
+  1 R/W Reserved. -
+  0 R/W Vertical Total bit 10. 0x6*/
+        if (svga->crtc[0x1b] & 0x01) svga->vtotal      += 0x400;
+        if (svga->crtc[0x1b] & 0x04) svga->dispend     += 0x400;
+        if (svga->crtc[0x1b] & 0x10) svga->vblankstart += 0x400;
+        if (svga->crtc[0x1b] & 0x40) svga->vsyncstart  += 0x400;
+//        pclog("svga->hdisp=%i\n", svga->hdisp);
+
+        if (banshee->vgaInit0 & VGAINIT0_EXTENDED_SHIFT_OUT)
+        {
+                switch (VIDPROCCFG_DESKTOP_PIX_FORMAT)
+                {
+                        case PIX_FORMAT_8:
+                        svga->render = svga_render_8bpp_highres;
+                        svga->bpp = 8;
+                        break;
+                        case PIX_FORMAT_RGB565:
+                        svga->render = (banshee->vidProcCfg & VIDPROCCFG_DESKTOP_TILE) ? banshee_render_16bpp_tiled : svga_render_16bpp_highres;
+                        svga->bpp = 16;
+                        break;
+                        case PIX_FORMAT_RGB24:
+                        svga->render = svga_render_24bpp_highres;
+                        svga->bpp = 24;
+                        break;
+                        case PIX_FORMAT_RGB32:
+                        svga->render = svga_render_32bpp_highres;
+                        svga->bpp = 32;
+                        break;
+
+#ifndef RELEASE_BUILD
+                        default:
+                        fatal("Unknown pixel format %08x\n", banshee->vgaInit0);
+#endif
+                }
+                svga->rowcount = 0;
+                if (!(banshee->vidProcCfg & VIDPROCCFG_DESKTOP_TILE) && (banshee->vidProcCfg & VIDPROCCFG_HALF_MODE))
+                        svga->linedbl = 1;
+                else
+                        svga->linedbl = 0;
+                if (banshee->vidProcCfg & VIDPROCCFG_DESKTOP_TILE)
+                        svga->rowoffset = ((banshee->vidDesktopOverlayStride & 0x3fff) * 128) >> 3;
+                else
+                        svga->rowoffset = (banshee->vidDesktopOverlayStride & 0x3fff) >> 3;
+                svga->ma_latch = banshee->vidDesktopStartAddr >> 2;
+                banshee->desktop_stride_tiled = (banshee->vidDesktopOverlayStride & 0x3fff) * 128 * 32;
+//                pclog("Extended shift out %i rowoffset=%i %02x\n", VIDPROCCFG_DESKTOP_PIX_FORMAT, svga->rowoffset, svga->crtc[1]);
+
+                svga->char_width = 8;
+                svga->split = 99999;
+
+                if (banshee->vidProcCfg & VIDPROCCFG_2X_MODE)
+                {
+                        svga->hdisp *= 2;
+                        svga->htotal *= 2;
+                }
+
+                svga->overlay.ena = banshee->vidProcCfg & VIDPROCCFG_OVERLAY_ENABLE;
+
+                svga->overlay.x = voodoo->overlay.start_x;
+                svga->overlay.y = voodoo->overlay.start_y;
+                svga->overlay.xsize = voodoo->overlay.size_x;
+                svga->overlay.ysize = voodoo->overlay.size_y;
+                svga->overlay.pitch = (banshee->vidDesktopOverlayStride & VID_STRIDE_OVERLAY_MASK) >> VID_STRIDE_OVERLAY_SHIFT;
+                if (banshee->vidProcCfg & VIDPROCCFG_OVERLAY_TILE)
+                        svga->overlay.pitch *= 128*32;
+                if (svga->overlay.xsize <= 0 || svga->overlay.ysize <= 0)
+                        svga->overlay.ena = 0;
+                if (svga->overlay.ena)
+                {
+/*                        pclog("Overlay enabled : start=%i,%i end=%i,%i size=%i,%i pitch=%x\n",
+                                voodoo->overlay.start_x, voodoo->overlay.start_y,
+                                voodoo->overlay.end_x, voodoo->overlay.end_y,
+                                voodoo->overlay.size_x, voodoo->overlay.size_y,
+                                svga->overlay.pitch);*/
+                        if (!voodoo->overlay.start_x && !voodoo->overlay.start_y &&
+                            svga->hdisp == voodoo->overlay.size_x && svga->dispend == voodoo->overlay.size_y)
+                        {
+                                /*Overlay is full screen, so don't bother rendering the desktop
+                                  behind it*/
+                                svga->render = svga_render_null;
+                                svga->bpp = 0;
+                        }
+                }
+
+                svga->video_res_override = 1;
+                svga->video_res_x = svga->hdisp;
+                svga->video_res_y = svga->dispend;
+                svga->video_bpp = svga->bpp;
+        }
+        else
+        {
+//                pclog("Normal shift out\n");
+                svga->bpp = 8;
+                svga->video_res_override = 0;
+        }
+
+        if (((svga->miscout >> 2) & 3) == 3)
+        {
+                int k = banshee->pllCtrl0 & 3;
+                int m = (banshee->pllCtrl0 >> 2) & 0x3f;
+                int n = (banshee->pllCtrl0 >> 8) & 0xff;
+                double freq = (((double)n + 2) / (((double)m + 2) * (double)(1 << k))) * 14318184.0;
+
+                svga->clock = (cpuclock * (float)(1ull << 32)) / freq;
+//                svga->clock = cpuclock / freq;
+                
+//                pclog("svga->clock = %g %g  m=%i k=%i n=%i\n", freq, freq / 1000000.0, m, k, n);
+        }
+}
+
+static void banshee_ext_out(uint16_t addr, uint8_t val, void *p)
+{
+//        banshee_t *banshee = (banshee_t *)p;
+//        svga_t *svga = &banshee->svga;
+
+//        pclog("banshee_ext_out: addr=%04x val=%02x\n", addr, val);
+        
+        switch (addr & 0xff)
+        {
+                case 0xb0: case 0xb1: case 0xb2: case 0xb3:
+                case 0xb4: case 0xb5: case 0xb6: case 0xb7:
+                case 0xb8: case 0xb9: case 0xba: case 0xbb:
+                case 0xbc: case 0xbd: case 0xbe: case 0xbf:
+                case 0xc0: case 0xc1: case 0xc2: case 0xc3:
+                case 0xc4: case 0xc5: case 0xc6: case 0xc7:
+                case 0xc8: case 0xc9: case 0xca: case 0xcb:
+                case 0xcc: case 0xcd: case 0xce: case 0xcf:
+                case 0xd0: case 0xd1: case 0xd2: case 0xd3:
+                case 0xd4: case 0xd5: case 0xd6: case 0xd7:
+                case 0xd8: case 0xd9: case 0xda: case 0xdb:
+                case 0xdc: case 0xdd: case 0xde: case 0xdf:
+                banshee_out((addr & 0xff)+0x300, val, p);
+                break;
+                        
+                default:
+                pclog("bad banshee_ext_out: addr=%04x val=%02x\n", addr, val);
+        }
+}
+static void banshee_ext_outl(uint16_t addr, uint32_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        svga_t *svga = &banshee->svga;
+
+//        pclog("banshee_ext_outl: addr=%04x val=%08x %04x(%08x):%08x\n", addr, val, CS,cs,cpu_state.pc);
+        
+        switch (addr & 0xff)
+        {
+                case Init_pciInit0:
+                banshee->pciInit0 = val;
+                voodoo->read_time = pci_nonburst_time + pci_burst_time * ((val & 0x100) ? 2 : 1);
+                voodoo->burst_time = pci_burst_time * ((val & 0x200) ? 1 : 0);
+                voodoo->write_time = pci_nonburst_time + voodoo->burst_time;
+                break;
+                        
+                case Init_lfbMemoryConfig:
+                banshee->lfbMemoryConfig = val;
+//                pclog("lfbMemoryConfig=%08x\n", val);
+                voodoo->tile_base = (val & 0x1fff) << 12;
+                voodoo->tile_stride = 1024 << ((val >> 13) & 7);
+                voodoo->tile_stride_shift = 10 + ((val >> 13) & 7);
+                voodoo->tile_x = ((val >> 16) & 0x7f) * 128;
+                voodoo->tile_x_real = ((val >> 16) & 0x7f) * 128*32;
+                break;
+
+                case Init_miscInit0:
+                banshee->miscInit0 = val;
+                break;
+                case Init_miscInit1:
+                banshee->miscInit1 = val;
+                break;
+                case Init_dramInit0:
+                banshee->dramInit0 = val;
+                break;
+                case Init_dramInit1:
+                banshee->dramInit1 = val;
+                break;
+                case Init_agpInit0:
+                banshee->agpInit0 = val;
+                break;
+
+                case Init_2dCommand:
+                banshee->command_2d = val;
+                break;
+                case Init_2dSrcBaseAddr:
+                banshee->srcBaseAddr_2d = val;
+                break;
+                case Init_vgaInit0:
+                banshee->vgaInit0 = val;
+                break;
+                case Init_vgaInit1:
+                banshee->vgaInit1 = val;
+                svga->write_bank = (val & 0x3ff) << 15;
+                svga->read_bank = ((val >> 10) & 0x3ff) << 15;
+                break;
+
+                case PLL_pllCtrl0:
+                banshee->pllCtrl0 = val;
+                break;
+                case PLL_pllCtrl1:
+                banshee->pllCtrl1 = val;
+                break;
+                case PLL_pllCtrl2:
+                banshee->pllCtrl2 = val;
+                break;
+
+                case DAC_dacMode:
+                banshee->dacMode = val;
+                break;
+                case DAC_dacAddr:
+                banshee->dacAddr = val & 0x1ff;
+                break;
+                case DAC_dacData:
+                svga->pallook[banshee->dacAddr] = val & 0xffffff;
+                svga->fullchange = changeframecount;
+                break;
+
+                case Video_vidProcCfg:                                
+                banshee->vidProcCfg = val;
+//                pclog("vidProcCfg=%08x\n", val);
+                banshee->overlay_pix_fmt = (val & VIDPROCCFG_OVERLAY_PIX_FORMAT_MASK) >> VIDPROCCFG_OVERLAY_PIX_FORMAT_SHIFT;
+                svga->hwcursor.ena = val & VIDPROCCFG_HWCURSOR_ENA;
+                svga->fullchange = changeframecount;
+                svga_recalctimings(svga);
+                break;
+
+                case Video_maxRgbDelta:
+                banshee->voodoo->scrfilterThreshold = val;
+                if (val > 0x00)
+                        banshee->voodoo->scrfilterEnabled = 1;
+                else
+                        banshee->voodoo->scrfilterEnabled = 0;
+                voodoo_threshold_check(banshee->voodoo);
+                pclog("Banshee Filter: %06x\n", val);
+
+                break;
+
+                case Video_hwCurPatAddr:
+                banshee->hwCurPatAddr = val;
+                svga->hwcursor.addr = (val & 0xfffff0) + (svga->hwcursor.yoff * 16);
+                break;
+                case Video_hwCurLoc:
+                banshee->hwCurLoc = val;
+                svga->hwcursor.x = (val & 0x7ff) - 32;
+                svga->hwcursor.y = ((val >> 16) & 0x7ff) - 64;
+                if (svga->hwcursor.y < 0)
+                {
+                        svga->hwcursor.yoff = -svga->hwcursor.y;
+                        svga->hwcursor.y = 0;
+                }
+                else
+                        svga->hwcursor.yoff = 0;
+                svga->hwcursor.addr = (banshee->hwCurPatAddr & 0xfffff0) + (svga->hwcursor.yoff * 16);
+                svga->hwcursor.xsize = 64;
+                svga->hwcursor.ysize = 64;
+//                pclog("hwCurLoc %08x %i\n", val, svga->hwcursor.y);
+                break;
+                case Video_hwCurC0:
+                banshee->hwCurC0 = val;
+                break;
+                case Video_hwCurC1:
+                banshee->hwCurC1 = val;
+                break;
+
+                case Video_vidSerialParallelPort:
+                banshee->vidSerialParallelPort = val;
+//                pclog("vidSerialParallelPort: write %08x %08x %04x(%08x):%08x\n", val, val & (VIDSERIAL_DDC_DCK_W | VIDSERIAL_DDC_DDA_W), CS,cs,cpu_state.pc);
+                ddc_i2c_change((val & VIDSERIAL_DDC_DCK_W) ? 1 : 0, (val & VIDSERIAL_DDC_DDA_W) ? 1 : 0);
+                break;
+
+                case Video_vidScreenSize:
+                banshee->vidScreenSize = val;
+                voodoo->h_disp = (val & 0xfff) + 1;
+                voodoo->v_disp = (val >> 12) & 0xfff;
+                break;
+                case Video_vidOverlayStartCoords:
+                voodoo->overlay.vidOverlayStartCoords = val;
+                voodoo->overlay.start_x = val & OVERLAY_START_X_MASK;
+                voodoo->overlay.start_y = (val & OVERLAY_START_Y_MASK) >> OVERLAY_START_Y_SHIFT;
+                voodoo->overlay.size_x = voodoo->overlay.end_x - voodoo->overlay.start_x;
+                voodoo->overlay.size_y = voodoo->overlay.end_y - voodoo->overlay.start_y;
+                svga_recalctimings(svga);
+                break;
+                case Video_vidOverlayEndScreenCoords:
+                voodoo->overlay.vidOverlayEndScreenCoords = val;
+                voodoo->overlay.end_x = val & OVERLAY_END_X_MASK;
+                voodoo->overlay.end_y = (val & OVERLAY_END_Y_MASK) >> OVERLAY_END_Y_SHIFT;
+                voodoo->overlay.size_x = (voodoo->overlay.end_x - voodoo->overlay.start_x) + 1;
+                voodoo->overlay.size_y = (voodoo->overlay.end_y - voodoo->overlay.start_y) + 1;
+                svga_recalctimings(svga);
+                break;
+                case Video_vidOverlayDudx:
+                voodoo->overlay.vidOverlayDudx = val & VID_DUDX_MASK;
+//                pclog("vidOverlayDudx=%08x\n", val);
+                break;
+                case Video_vidOverlayDudxOffsetSrcWidth:
+                voodoo->overlay.vidOverlayDudxOffsetSrcWidth = val;
+                voodoo->overlay.overlay_bytes = (val & OVERLAY_SRC_WIDTH_MASK) >> OVERLAY_SRC_WIDTH_SHIFT;
+//                pclog("vidOverlayDudxOffsetSrcWidth=%08x\n", val);
+                break;
+                case Video_vidOverlayDvdy:
+                voodoo->overlay.vidOverlayDvdy = val & VID_DVDY_MASK;
+//                pclog("vidOverlayDvdy=%08x\n", val);
+                break;
+                case Video_vidOverlayDvdyOffset:
+                voodoo->overlay.vidOverlayDvdyOffset = val;
+                break;
+
+
+                case Video_vidDesktopStartAddr:
+                banshee->vidDesktopStartAddr = val & 0xffffff;
+//                pclog("vidDesktopStartAddr=%08x\n", val);
+                svga->fullchange = changeframecount;
+                svga_recalctimings(svga);
+                break;
+                case Video_vidDesktopOverlayStride:
+                banshee->vidDesktopOverlayStride = val;
+//                pclog("vidDesktopOverlayStride=%08x\n", val);
+                svga->fullchange = changeframecount;
+                svga_recalctimings(svga);
+                break;
+//                default:
+//                fatal("bad banshee_ext_outl: addr=%04x val=%08x\n", addr, val);
+        }
+}
+
+static uint8_t banshee_ext_in(uint16_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+//        svga_t *svga = &banshee->svga;
+        uint8_t ret = 0xff;
+             
+        switch (addr & 0xff)
+        {
+                case Init_status: case Init_status+1: case Init_status+2: case Init_status+3:
+                ret = (banshee_status(banshee) >> ((addr & 3) * 8)) & 0xff;
+//                pclog("Read status reg! %04x(%08x):%08x\n", CS, cs, cpu_state.pc);
+                break;
+
+                case 0xb0: case 0xb1: case 0xb2: case 0xb3:
+                case 0xb4: case 0xb5: case 0xb6: case 0xb7:
+                case 0xb8: case 0xb9: case 0xba: case 0xbb:
+                case 0xbc: case 0xbd: case 0xbe: case 0xbf:
+                case 0xc0: case 0xc1: case 0xc2: case 0xc3:
+                case 0xc4: case 0xc5: case 0xc6: case 0xc7:
+                case 0xc8: case 0xc9: case 0xca: case 0xcb:
+                case 0xcc: case 0xcd: case 0xce: case 0xcf:
+                case 0xd0: case 0xd1: case 0xd2: case 0xd3:
+                case 0xd4: case 0xd5: case 0xd6: case 0xd7:
+                case 0xd8: case 0xd9: case 0xda: case 0xdb:
+                case 0xdc: case 0xdd: case 0xde: case 0xdf:
+                ret = banshee_in((addr & 0xff)+0x300, p);
+                break;
+
+                default:
+                pclog("bad banshee_ext_in: addr=%04x\n", addr);
+                break;
+        }
+
+//        pclog("banshee_ext_in: addr=%04x val=%02x\n", addr, ret);
+        
+        return ret;
+}
+
+/*Build the value of the status register (Init_status).
+  Layout produced here:
+    bits 4:0   free FIFO space, saturated at 0x1f
+    bit  5     set while the free FIFO space is non-zero
+    bit  6     set while bit 3 of svga->cgastat is clear
+    bits 10:7  set together while anything is busy
+    bit  11    set while the command FIFO read and write depths differ
+    bits 30:28 swap count, saturated at 7
+  Side effect: wakes the FIFO thread when voodoo->voodoo_busy is clear*/
+static uint32_t banshee_status(banshee_t *banshee)
+{
+        voodoo_t *voodoo = banshee->voodoo;
+        svga_t *svga = &banshee->svga;
+        /*NOTE(review): FIFO_ENTRIES is a macro defined elsewhere; it presumably
+          reads the FIFO indices through the local named voodoo - confirm*/
+        int fifo_entries = FIFO_ENTRIES;
+        int fifo_size = 0xffff - fifo_entries;
+        int swap_count = voodoo->swap_count;
+        int written = voodoo->cmd_written + voodoo->cmd_written_fifo;
+        /*Busy if commands are outstanding, the command FIFO is not drained,
+          or any of the four render_voodoo_busy flags or voodoo_busy is set*/
+        int busy = (written - voodoo->cmd_read) || (voodoo->cmdfifo_depth_rd != voodoo->cmdfifo_depth_wr) ||
+                voodoo->render_voodoo_busy[0] || voodoo->render_voodoo_busy[1] ||
+                voodoo->render_voodoo_busy[2] || voodoo->render_voodoo_busy[3] ||
+                voodoo->voodoo_busy;
+        uint32_t ret;
+
+        ret = 0;
+        if (fifo_size < 0x20)
+                ret |= fifo_size;
+        else
+                ret |= 0x1f;
+        if (fifo_size)
+                ret |= 0x20;
+        if (swap_count < 7)
+                ret |= (swap_count << 28);
+        else
+                ret |= (7 << 28);
+        if (!(svga->cgastat & 8))
+                ret |= 0x40;
+
+        if (busy)
+                ret |= 0x780; /*Busy*/
+
+        if (voodoo->cmdfifo_depth_rd != voodoo->cmdfifo_depth_wr)
+                ret |= (1 << 11);
+
+        /*Polling the status register nudges the FIFO thread along*/
+        if (!voodoo->voodoo_busy)
+                voodoo_wake_fifo_thread(voodoo);
+
+//        pclog("banshee_status: busy %i  %i (%i %i)  %i   %i %i  %04x(%08x):%08x %08x\n", busy, written, voodoo->cmd_written, voodoo->cmd_written_fifo, voodoo->cmd_read, voodoo->cmdfifo_depth_rd, voodoo->cmdfifo_depth_wr, CS,cs,cpu_state.pc, ret);
+
+        return ret;
+}
+
+/*
+ * 32-bit read from the Banshee extended register block.
+ *
+ *   addr - register address; only the low 8 bits select the register.
+ *   p    - the banshee_t instance.
+ *
+ * Charges voodoo->read_time to the CPU cycle counter, then returns the value
+ * of the selected register. Registers without a case below read as 0xffffffff.
+ */
+static uint32_t banshee_ext_inl(uint16_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        svga_t *svga = &banshee->svga;
+
+        cycles -= voodoo->read_time;
+
+        switch (addr & 0xff)
+        {
+                /*Init registers*/
+                case Init_status:
+                return banshee_status(banshee);
+                case Init_pciInit0:
+                return banshee->pciInit0;
+                case Init_lfbMemoryConfig:
+                return banshee->lfbMemoryConfig;
+                case Init_miscInit0:
+                return banshee->miscInit0;
+                case Init_miscInit1:
+                return banshee->miscInit1;
+                case Init_dramInit0:
+                return banshee->dramInit0;
+                case Init_dramInit1:
+                return banshee->dramInit1;
+                case Init_agpInit0:
+                return banshee->agpInit0;
+                case Init_vgaInit0:
+                return banshee->vgaInit0;
+                case Init_vgaInit1:
+                return banshee->vgaInit1;
+                case Init_2dCommand:
+                return banshee->command_2d;
+                case Init_2dSrcBaseAddr:
+                return banshee->srcBaseAddr_2d;
+                case Init_strapInfo:
+                return 0x00000040; /*8 MB SGRAM, PCI, IRQ enabled, 32kB BIOS*/
+
+                /*PLL registers*/
+                case PLL_pllCtrl0:
+                return banshee->pllCtrl0;
+                case PLL_pllCtrl1:
+                return banshee->pllCtrl1;
+                case PLL_pllCtrl2:
+                return banshee->pllCtrl2;
+
+                /*DAC registers*/
+                case DAC_dacMode:
+                return banshee->dacMode;
+                case DAC_dacAddr:
+                return banshee->dacAddr;
+                case DAC_dacData:
+                return svga->pallook[banshee->dacAddr];
+
+                /*Video registers*/
+                case Video_vidProcCfg:
+                return banshee->vidProcCfg;
+                case Video_hwCurPatAddr:
+                return banshee->hwCurPatAddr;
+                case Video_hwCurLoc:
+                return banshee->hwCurLoc;
+                case Video_hwCurC0:
+                return banshee->hwCurC0;
+                case Video_hwCurC1:
+                return banshee->hwCurC1;
+
+                case Video_vidSerialParallelPort:
+                {
+                        /*Start from the stored value with the DDC read-back bits cleared*/
+                        uint32_t port = banshee->vidSerialParallelPort & ~(VIDSERIAL_DDC_DCK_R | VIDSERIAL_DDC_DDA_R);
+
+                        /*A DDC read-back bit is set only if the matching write bit is set and the DDC line reads high*/
+                        if ((banshee->vidSerialParallelPort & VIDSERIAL_DDC_DCK_W) && ddc_read_clock())
+                                port |= VIDSERIAL_DDC_DCK_R;
+                        if ((banshee->vidSerialParallelPort & VIDSERIAL_DDC_DDA_W) && ddc_read_data())
+                                port |= VIDSERIAL_DDC_DDA_R;
+
+                        /*The I2C read-back bits simply mirror the corresponding write bits*/
+                        port = port & ~(VIDSERIAL_I2C_SCK_R | VIDSERIAL_I2C_SDA_R);
+                        if (banshee->vidSerialParallelPort & VIDSERIAL_I2C_SCK_W)
+                                port |= VIDSERIAL_I2C_SCK_R;
+                        if (banshee->vidSerialParallelPort & VIDSERIAL_I2C_SDA_W)
+                                port |= VIDSERIAL_I2C_SDA_R;
+
+                        return port;
+                }
+
+                case Video_vidScreenSize:
+                return banshee->vidScreenSize;
+                case Video_vidOverlayStartCoords:
+                return voodoo->overlay.vidOverlayStartCoords;
+                case Video_vidOverlayEndScreenCoords:
+                return voodoo->overlay.vidOverlayEndScreenCoords;
+                case Video_vidOverlayDudx:
+                return voodoo->overlay.vidOverlayDudx;
+                case Video_vidOverlayDudxOffsetSrcWidth:
+                return voodoo->overlay.vidOverlayDudxOffsetSrcWidth;
+                case Video_vidOverlayDvdy:
+                return voodoo->overlay.vidOverlayDvdy;
+                case Video_vidOverlayDvdyOffset:
+                return voodoo->overlay.vidOverlayDvdyOffset;
+                case Video_vidDesktopStartAddr:
+                return banshee->vidDesktopStartAddr;
+                case Video_vidDesktopOverlayStride:
+                return banshee->vidDesktopOverlayStride;
+
+                default:
+                break;
+        }
+
+        /*Unhandled register*/
+        return 0xffffffff;
+}
+
+
+static uint32_t banshee_reg_readl(uint32_t addr, void *p);
+
+/*
+ * Byte read from the register aperture: performs the 32-bit read of the
+ * containing dword and returns the byte selected by the low two address bits.
+ */
+static uint8_t banshee_reg_read(uint32_t addr, void *p)
+{
+        uint32_t dword = banshee_reg_readl(addr & ~3, p);
+        uint32_t shift = 8*(addr & 3);
+
+        return dword >> shift;
+}
+
+/*
+ * 16-bit read from the register aperture: performs the 32-bit read of the
+ * containing dword and returns its low or high half depending on address bit 1.
+ */
+static uint16_t banshee_reg_readw(uint32_t addr, void *p)
+{
+        uint32_t dword = banshee_reg_readl(addr & ~3, p);
+        uint32_t shift = 8*(addr & 2); /*0 or 16*/
+
+        return dword >> shift;
+}
+
+static uint32_t banshee_cmd_read(banshee_t *banshee, uint32_t addr)
+{
+        voodoo_t *voodoo = banshee->voodoo;
+        uint32_t ret = 0xffffffff;
+
+        switch (addr & 0x1fc)
+        {
+                case cmdBaseAddr0:
+                ret = voodoo->cmdfifo_base >> 12;
+//                pclog("Read cmdfifo_base %08x\n", ret);
+                break;
+                
+                case cmdRdPtrL0:
+                ret = voodoo->cmdfifo_rp;
+//                pclog("Read cmdfifo_rp %08x\n", ret);
+                break;
+                
+                case cmdFifoDepth0:
+                ret = voodoo->cmdfifo_depth_wr - voodoo->cmdfifo_depth_rd;
+//                pclog("Read cmdfifo_depth %08x\n", ret);
+                break;
+
+                case 0x108:
+                break;
+
+#ifndef RELEASE_BUILD
+                default:
+                fatal("Unknown banshee_cmd_read %08x\n", addr);
+#endif
+        }
+        
+        return ret;
+}
+
+/*
+ * Read back a 2D register (selected by addr & 0x1fc). Pending work is flushed
+ * with voodoo_flush() first. Unhandled registers are logged and read as
+ * 0xffffffff.
+ */
+static uint32_t banshee_2d_reg_readl(voodoo_t *voodoo, uint32_t addr)
+{
+        voodoo_flush(voodoo);
+
+        switch (addr & 0x1fc)
+        {
+                case 0x08:
+                return voodoo->banshee_blt.clip0Min;
+                case 0x0c:
+                return voodoo->banshee_blt.clip0Max;
+                case 0x10:
+                return voodoo->banshee_blt.dstBaseAddr;
+                case 0x14:
+                return voodoo->banshee_blt.dstFormat;
+                case 0x34:
+                return voodoo->banshee_blt.srcBaseAddr;
+                case 0x38:
+                return voodoo->banshee_blt.commandExtra;
+                case 0x5c:
+                return voodoo->banshee_blt.srcXY;
+                case 0x60:
+                return voodoo->banshee_blt.colorBack;
+                case 0x64:
+                return voodoo->banshee_blt.colorFore;
+                case 0x68:
+                return voodoo->banshee_blt.dstSize;
+                case 0x6c:
+                return voodoo->banshee_blt.dstXY;
+                case 0x70:
+                return voodoo->banshee_blt.command;
+
+                default:
+                pclog("banshee_reg_readl: addr=%08x\n", addr);
+                break;
+        }
+
+        return 0xffffffff;
+}
+
+/*
+ * Read back a 3D register (selected by addr & 0x3fc). Registers held in
+ * voodoo->params (other than the clip rectangles) and lfbMode are read only
+ * after voodoo_flush(). Unhandled registers are logged and read as 0xffffffff.
+ */
+static uint32_t banshee_3d_reg_readl(banshee_t *banshee, uint32_t addr)
+{
+        voodoo_t *voodoo = banshee->voodoo;
+
+        switch (addr & 0x3fc)
+        {
+                case SST_status:
+                return banshee_status(banshee);
+
+                case SST_intrCtrl:
+                return banshee->intrCtrl & 0x0030003f;
+
+                case SST_fbzColorPath:
+                voodoo_flush(voodoo);
+                return voodoo->params.fbzColorPath;
+                case SST_fogMode:
+                voodoo_flush(voodoo);
+                return voodoo->params.fogMode;
+                case SST_alphaMode:
+                voodoo_flush(voodoo);
+                return voodoo->params.alphaMode;
+                case SST_fbzMode:
+                voodoo_flush(voodoo);
+                return voodoo->params.fbzMode;
+                case SST_lfbMode:
+                voodoo_flush(voodoo);
+                return voodoo->lfbMode;
+
+                /*Clip rectangles: one bound in the low half, the other in the high half*/
+                case SST_clipLeftRight:
+                return voodoo->params.clipRight | (voodoo->params.clipLeft << 16);
+                case SST_clipLowYHighY:
+                return voodoo->params.clipHighY | (voodoo->params.clipLowY << 16);
+                case SST_clipLeftRight1:
+                return voodoo->params.clipRight1 | (voodoo->params.clipLeft1 << 16);
+                case SST_clipTopBottom1:
+                return voodoo->params.clipHighY1 | (voodoo->params.clipLowY1 << 16);
+
+                case SST_stipple:
+                voodoo_flush(voodoo);
+                return voodoo->params.stipple;
+                case SST_color0:
+                voodoo_flush(voodoo);
+                return voodoo->params.color0;
+                case SST_color1:
+                voodoo_flush(voodoo);
+                return voodoo->params.color1;
+
+                /*Statistics counters are reported as 24-bit values*/
+                case SST_fbiPixelsIn:
+                return voodoo->fbiPixelsIn & 0xffffff;
+                case SST_fbiChromaFail:
+                return voodoo->fbiChromaFail & 0xffffff;
+                case SST_fbiZFuncFail:
+                return voodoo->fbiZFuncFail & 0xffffff;
+                case SST_fbiAFuncFail:
+                return voodoo->fbiAFuncFail & 0xffffff;
+                case SST_fbiPixelsOut:
+                return voodoo->fbiPixelsOut & 0xffffff;
+
+                default:
+                pclog("banshee_reg_readl: 3D addr=%08x\n", addr);
+                break;
+        }
+
+        return 0xffffffff;
+}
+
+/*
+ * 32-bit read from the memory-mapped register aperture.
+ *
+ *   addr - address within the aperture; bits 20-24 select the region.
+ *   p    - the banshee_t instance.
+ *
+ * Charges voodoo->read_time, then dispatches to the I/O remap / CMDFIFO
+ * registers (region 0, split on bit 19), the 2D registers (region 1) or the
+ * 3D registers (regions 2-5). Any other region reads as 0xffffffff.
+ */
+static uint32_t banshee_reg_readl(uint32_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+
+        cycles -= voodoo->read_time;
+
+        switch (addr & 0x1f00000)
+        {
+                case 0x0000000: /*IO remap*/
+                if (addr & 0x80000)
+                        return banshee_cmd_read(banshee, addr);
+                return banshee_ext_inl(addr & 0xff, banshee);
+
+                case 0x0100000: /*2D registers*/
+                return banshee_2d_reg_readl(voodoo, addr);
+
+                case 0x0200000: case 0x0300000: case 0x0400000: case 0x0500000: /*3D registers*/
+                return banshee_3d_reg_readl(banshee, addr);
+        }
+
+        return 0xffffffff;
+}
+
+static void banshee_reg_write(uint32_t addr, uint8_t val, void *p)
+{       /* Byte-wide write handler for the register aperture: the body is empty, so the write is discarded. */
+//        pclog("banshee_reg_writeb: addr=%08x val=%02x\n", addr, val);
+}
+
+/*
+ * 16-bit write to the memory-mapped register aperture.
+ *
+ * Charges voodoo->write_time. Only writes landing in the 3D LFB window
+ * (regions 0x1000000-0x1f00000) have any effect: they are queued as a
+ * FIFO_WRITEW_FB command. All other regions ignore 16-bit writes.
+ */
+static void banshee_reg_writew(uint32_t addr, uint16_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+
+        cycles -= voodoo->write_time;
+
+        /*Within the 0x1f00000 region mask, the sixteen LFB regions are exactly those with bit 24 set*/
+        if (addr & 0x1000000) /*3D LFB*/
+                voodoo_queue_command(voodoo, (addr & 0xffffff) | FIFO_WRITEW_FB, val);
+}
+
+static void banshee_cmd_write(banshee_t *banshee, uint32_t addr, uint32_t val)
+{       /* Handle a 32-bit write to a CMDFIFO control register, selected by
+         * addr & 0x1fc. Only the voodoo->cmdfifo_* fields are updated here;
+         * unrecognised offsets are logged through pclog() and otherwise ignored.
+         */
+        voodoo_t *voodoo = banshee->voodoo;
+//        pclog("banshee_cmd_write: addr=%03x val=%08x\n", addr & 0x1fc, val);
+        switch (addr & 0x1fc)
+        {
+                case cmdBaseAddr0:
+                voodoo->cmdfifo_base = (val & 0xfff) << 12; /* low 12 bits of val give the base in 4 kB units */
+                voodoo->cmdfifo_end = voodoo->cmdfifo_base + (((voodoo->cmdfifo_size & 0xff) + 1) << 12); /* end is recomputed from the stored size */
+//                pclog("cmdfifo_base=%08x  cmdfifo_end=%08x %08x\n", voodoo->cmdfifo_base, voodoo->cmdfifo_end, val);
+                break;
+                
+                case cmdBaseSize0:
+                voodoo->cmdfifo_size = val;
+                voodoo->cmdfifo_end = voodoo->cmdfifo_base + (((voodoo->cmdfifo_size & 0xff) + 1) << 12); /* (low 8 bits + 1) units of 4 kB past the base */
+                voodoo->cmdfifo_enabled = val & 0x100; /* bit 8 is the enable flag */
+                if (!voodoo->cmdfifo_enabled)
+                        voodoo->cmdfifo_in_sub = 0; /*Not sure exactly when this should be reset*/
+//                pclog("cmdfifo_base=%08x  cmdfifo_end=%08x\n", voodoo->cmdfifo_base, voodoo->cmdfifo_end);
+                break;
+                
+//                voodoo->cmdfifo_end = ((val >> 16) & 0x3ff) << 12;
+//                pclog("CMDFIFO base=%08x end=%08x\n", voodoo->cmdfifo_base, voodoo->cmdfifo_end);
+//                break;
+
+                case cmdRdPtrL0:
+                voodoo->cmdfifo_rp = val;
+                break;
+                case cmdAMin0:
+                voodoo->cmdfifo_amin = val;
+                break;
+                case cmdAMax0:
+                voodoo->cmdfifo_amax = val;
+                break;
+                case cmdFifoDepth0:
+                voodoo->cmdfifo_depth_rd = 0; /* read counter restarts; the write counter takes the low 16 bits of val */
+                voodoo->cmdfifo_depth_wr = val & 0xffff;
+                break;
+                
+                default:
+                pclog("Unknown banshee_cmd_write: addr=%08x val=%08x\n", addr, val);
+                break;
+        }
+
+/*        cmdBaseSize0  = 0x24,
+        cmdBump0      = 0x28,
+        cmdRdPtrL0    = 0x2c,
+        cmdRdPtrH0    = 0x30,
+        cmdAMin0      = 0x34,
+        cmdAMax0      = 0x3c,
+        cmdFifoDepth0 = 0x44,
+        cmdHoleCnt0   = 0x48
+        }*/
+}
+
+static void banshee_reg_writel(uint32_t addr, uint32_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        
+        if (addr == voodoo->last_write_addr+4)
+                cycles -= voodoo->burst_time;
+        else
+                cycles -= voodoo->write_time;
+        voodoo->last_write_addr = addr;
+
+//        pclog("banshee_reg_writel: addr=%08x val=%08x\n", addr, val);
+        
+        switch (addr & 0x1f00000)
+        {
+                case 0x0000000: /*IO remap*/
+                if (!(addr & 0x80000))
+                        banshee_ext_outl(addr & 0xff, val, banshee);
+                else
+                        banshee_cmd_write(banshee, addr, val);
+//                        pclog("CMD!!! write %08x %08x\n", addr, val);
+                break;
+
+                case 0x0100000: /*2D registers*/
+                voodoo_queue_command(voodoo, (addr & 0x1fc) | FIFO_WRITEL_2DREG, val);
+                break;
+                
+                case 0x0200000: case 0x0300000: case 0x0400000: case 0x0500000: /*3D registers*/
+                switch (addr & 0x3fc)
+                {
+                        case SST_intrCtrl:
+                        banshee->intrCtrl = val & 0x0030003f;
+//                        pclog("intrCtrl=%08x\n", val);
+                        break;
+
+                        case SST_userIntrCMD:
+#ifndef RELEASE_BUILD
+                        fatal("userIntrCMD write %08x\n", val);
+#endif
+                        break;
+
+                        case SST_swapbufferCMD:
+                        voodoo->cmd_written++;
+                        voodoo_queue_command(voodoo, (addr & 0x3fc) | FIFO_WRITEL_REG, val);
+                        if (!voodoo->voodoo_busy)
+                                voodoo_wake_fifo_threads(voodoo->set, voodoo);
+//                        pclog("SST_swapbufferCMD write: %i %i\n", voodoo->cmd_written, voodoo->cmd_written_fifo);
+                        break;
+                        case SST_triangleCMD:
+                        voodoo->cmd_written++;
+                        voodoo_queue_command(voodoo, (addr & 0x3fc) | FIFO_WRITEL_REG, val);
+                        if (!voodoo->voodoo_busy)
+                                voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                        break;
+                        case SST_ftriangleCMD:
+                        voodoo->cmd_written++;
+                        voodoo_queue_command(voodoo, (addr & 0x3fc) | FIFO_WRITEL_REG, val);
+                        if (!voodoo->voodoo_busy)
+                                voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                        break;
+                        case SST_fastfillCMD:
+                        voodoo->cmd_written++;
+                        voodoo_queue_command(voodoo, (addr & 0x3fc) | FIFO_WRITEL_REG, val);
+                        if (!voodoo->voodoo_busy)
+                                voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                        break;
+                        case SST_nopCMD:
+                        voodoo->cmd_written++;
+                        voodoo_queue_command(voodoo, (addr & 0x3fc) | FIFO_WRITEL_REG, val);
+                        if (!voodoo->voodoo_busy)
+                                voodoo_wake_fifo_threads(voodoo->set, voodoo);
+                        break;
+                        
+                        case SST_swapPending:
+                        thread_lock_mutex(voodoo->swap_mutex);
+                        voodoo->swap_count++;
+                        thread_unlock_mutex(voodoo->swap_mutex);
+//                        voodoo->cmd_written++;
+                        break;
+                        
+                        default:
+                        voodoo_queue_command(voodoo, (addr & 0x3ffffc) | FIFO_WRITEL_REG, val);
+                        break;
+                }
+                break;
+                
+                case 0x0600000: case 0x0700000: /*Texture download*/
+                voodoo->tex_count++;
+                voodoo_queue_command(voodoo, (addr & 0x1ffffc) | FIFO_WRITEL_TEX, val);
+                break;
+                
+                case 0x1000000: case 0x1100000: case 0x1200000: case 0x1300000: /*3D LFB*/
+                case 0x1400000: case 0x1500000: case 0x1600000: case 0x1700000:
+                case 0x1800000: case 0x1900000: case 0x1a00000: case 0x1b00000:
+                case 0x1c00000: case 0x1d00000: case 0x1e00000: case 0x1f00000:
+                voodoo_queue_command(voodoo, (addr & 0xfffffc) | FIFO_WRITEL_FB, val);
+                break;
+        }
+}
+
+/*
+ * Byte read from the linear framebuffer aperture.
+ *
+ *   addr - bus address; masked with svga->decode_mask before use.
+ *   p    - the banshee_t instance.
+ *
+ * Addresses at or above voodoo->tile_base are remapped from a linear
+ * (x, y) layout into the tiled layout. Returns the VRAM byte, or 0xff when
+ * the resulting address is at or beyond svga->vram_max.
+ */
+static uint8_t banshee_read_linear(uint32_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        svga_t *svga = &banshee->svga;
+        voodoo_t *voodoo = banshee->voodoo;
+
+        cycles -= voodoo->read_time;
+        cycles_lost += voodoo->read_time;
+
+        addr &= svga->decode_mask;
+        if (addr >= voodoo->tile_base)
+        {
+                /*Split the offset into column/row, then rebuild it from 128-byte x 32-row tiles*/
+                uint32_t offset = addr - voodoo->tile_base;
+                int col = offset & (voodoo->tile_stride-1);
+                int row = offset >> voodoo->tile_stride_shift;
+
+                addr = voodoo->tile_base + (col & 127) + ((col >> 7) * 128*32) + ((row & 31) * 128) + (row >> 5)*voodoo->tile_x_real;
+        }
+
+        /*Outside installed VRAM*/
+        if (addr >= svga->vram_max)
+                return 0xff;
+
+        egareads++;
+        cycles -= video_timing_read_b;
+        cycles_lost += video_timing_read_b;
+
+        return svga->vram[addr & svga->vram_mask];
+}
+
+/*
+ * 16-bit read from the linear framebuffer aperture.
+ *
+ *   addr - bus address; masked with svga->decode_mask before use.
+ *   p    - the banshee_t instance.
+ *
+ * Odd addresses are split into two byte reads. Addresses at or above
+ * voodoo->tile_base are remapped from a linear (x, y) layout into the tiled
+ * layout.
+ *
+ * Returns the VRAM word, or 0xffff when the resulting address is at or beyond
+ * svga->vram_max. (This used to return 0x00ff, which disagreed with the
+ * unaligned path, where two out-of-range byte reads combine to 0xffff.)
+ */
+static uint16_t banshee_read_linear_w(uint32_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        svga_t *svga = &banshee->svga;
+
+        if (addr & 1)
+                return banshee_read_linear(addr, p) | (banshee_read_linear(addr+1, p) << 8);
+
+        cycles -= voodoo->read_time;
+        cycles_lost += voodoo->read_time;
+
+        addr &= svga->decode_mask;
+        if (addr >= voodoo->tile_base)
+        {
+                int x, y;
+
+                /*Split the offset into column/row, then rebuild it from 128-byte x 32-row tiles*/
+                addr -= voodoo->tile_base;
+                x = addr & (voodoo->tile_stride-1);
+                y = addr >> voodoo->tile_stride_shift;
+
+                addr = voodoo->tile_base + (x & 127) + ((x >> 7) * 128*32) + ((y & 31) * 128) + (y >> 5)*voodoo->tile_x_real;
+        }
+
+        /*Outside installed VRAM: all bits read as 1, matching the byte handler*/
+        if (addr >= svga->vram_max)
+                return 0xffff;
+
+        egareads++;
+        cycles -= video_timing_read_w;
+        cycles_lost += video_timing_read_w;
+
+        return *(uint16_t *)&svga->vram[addr & svga->vram_mask];
+}
+
+/*
+ * 32-bit read from the linear framebuffer aperture.
+ *
+ *   addr - bus address; masked with svga->decode_mask before use.
+ *   p    - the banshee_t instance.
+ *
+ * Addresses that are not dword aligned are split into two 16-bit reads.
+ * Addresses at or above voodoo->tile_base are remapped from a linear (x, y)
+ * layout into the tiled layout.
+ *
+ * Returns the VRAM dword, or 0xffffffff when the resulting address is at or
+ * beyond svga->vram_max. (This used to return 0x000000ff, which disagreed
+ * with the byte handler and with the unaligned path.)
+ */
+static uint32_t banshee_read_linear_l(uint32_t addr, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        svga_t *svga = &banshee->svga;
+
+        if (addr & 3)
+        {
+                /*Widen before shifting: a uint16_t promotes to signed int, and
+                  shifting a value with bit 15 set left by 16 would overflow it*/
+                uint32_t lo = banshee_read_linear_w(addr, p);
+                uint32_t hi = banshee_read_linear_w(addr+2, p);
+
+                return lo | (hi << 16);
+        }
+
+        cycles -= voodoo->read_time;
+        cycles_lost += voodoo->read_time;
+
+        addr &= svga->decode_mask;
+        if (addr >= voodoo->tile_base)
+        {
+                int x, y;
+
+                /*Split the offset into column/row, then rebuild it from 128-byte x 32-row tiles*/
+                addr -= voodoo->tile_base;
+                x = addr & (voodoo->tile_stride-1);
+                y = addr >> voodoo->tile_stride_shift;
+
+                addr = voodoo->tile_base + (x & 127) + ((x >> 7) * 128*32) + ((y & 31) * 128) + (y >> 5)*voodoo->tile_x_real;
+        }
+
+        /*Outside installed VRAM: all bits read as 1, matching the byte handler*/
+        if (addr >= svga->vram_max)
+                return 0xffffffff;
+
+        egareads++;
+        cycles -= video_timing_read_l;
+        cycles_lost += video_timing_read_l;
+
+        return *(uint32_t *)&svga->vram[addr & svga->vram_mask];
+}
+
+/*
+ * Byte write to the linear framebuffer aperture.
+ *
+ *   addr - bus address; masked with svga->decode_mask before use.
+ *   val  - byte to store.
+ *   p    - the banshee_t instance.
+ *
+ * Addresses at or above voodoo->tile_base are remapped from a linear (x, y)
+ * layout into the tiled layout. Writes whose resulting address is at or
+ * beyond svga->vram_max are dropped; otherwise the 4 kB page is marked as
+ * changed and the byte is stored.
+ */
+static void banshee_write_linear(uint32_t addr, uint8_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        svga_t *svga = &banshee->svga;
+        voodoo_t *voodoo = banshee->voodoo;
+
+        cycles -= voodoo->write_time;
+        cycles_lost += voodoo->write_time;
+
+        addr &= svga->decode_mask;
+        if (addr >= voodoo->tile_base)
+        {
+                /*Split the offset into column/row, then rebuild it from 128-byte x 32-row tiles*/
+                uint32_t offset = addr - voodoo->tile_base;
+                int col = offset & (voodoo->tile_stride-1);
+                int row = offset >> voodoo->tile_stride_shift;
+
+                addr = voodoo->tile_base + (col & 127) + ((col >> 7) * 128*32) + ((row & 31) * 128) + (row >> 5)*voodoo->tile_x_real;
+        }
+
+        if (addr < svga->vram_max)
+        {
+                egawrites++;
+
+                cycles -= video_timing_write_b;
+                cycles_lost += video_timing_write_b;
+
+                svga->changedvram[addr >> 12] = changeframecount;
+                svga->vram[addr & svga->vram_mask] = val;
+        }
+}
+
+/*
+ * 16-bit write to the linear framebuffer aperture.
+ *
+ *   addr - bus address; masked with svga->decode_mask before use.
+ *   val  - word to store.
+ *   p    - the banshee_t instance.
+ *
+ * Odd addresses are split into two byte writes (low byte first). Addresses
+ * at or above voodoo->tile_base are remapped from a linear (x, y) layout
+ * into the tiled layout. Writes whose resulting address is at or beyond
+ * svga->vram_max are dropped; otherwise the 4 kB page is marked as changed
+ * and the word is stored.
+ */
+static void banshee_write_linear_w(uint32_t addr, uint16_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        svga_t *svga = &banshee->svga;
+        voodoo_t *voodoo = banshee->voodoo;
+
+        if (addr & 1)
+        {
+                banshee_write_linear(addr, val, p);
+                banshee_write_linear(addr + 1, val >> 8, p);
+                return;
+        }
+
+        cycles -= voodoo->write_time;
+        cycles_lost += voodoo->write_time;
+
+        addr &= svga->decode_mask;
+        if (addr >= voodoo->tile_base)
+        {
+                /*Split the offset into column/row, then rebuild it from 128-byte x 32-row tiles*/
+                uint32_t offset = addr - voodoo->tile_base;
+                int col = offset & (voodoo->tile_stride-1);
+                int row = offset >> voodoo->tile_stride_shift;
+
+                addr = voodoo->tile_base + (col & 127) + ((col >> 7) * 128*32) + ((row & 31) * 128) + (row >> 5)*voodoo->tile_x_real;
+        }
+
+        if (addr < svga->vram_max)
+        {
+                egawrites++;
+
+                cycles -= video_timing_write_w;
+                cycles_lost += video_timing_write_w;
+
+                svga->changedvram[addr >> 12] = changeframecount;
+                *(uint16_t *)&svga->vram[addr & svga->vram_mask] = val;
+        }
+}
+
+static void banshee_write_linear_l(uint32_t addr, uint32_t val, void *p)
+{       /* 32-bit write to the linear framebuffer aperture.
+         *
+         *   addr - bus address; masked with svga->decode_mask before use.
+         *   val  - dword to store.
+         *   p    - the banshee_t instance.
+         *
+         * Addresses that are not dword aligned are split into two 16-bit
+         * writes. Otherwise the write is timed, translated into the tiled
+         * layout when at or above voodoo->tile_base, dropped when at or beyond
+         * svga->vram_max, and stored. A stored write that lands inside
+         * [cmdfifo_base, cmdfifo_end) while the CMDFIFO is enabled also
+         * updates the FIFO write tracking (amin / amax / holecount /
+         * depth_wr) and wakes the FIFO thread when new words become available.
+         */
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        svga_t *svga = &banshee->svga;
+        int timing;
+
+        if (addr & 3)
+        {
+                banshee_write_linear_w(addr, val, p);
+                banshee_write_linear_w(addr + 2, val >> 16, p);
+                return;
+        }
+
+        if (addr == voodoo->last_write_addr+4) /* directly follows the previous dword write: burst cost */
+                timing = voodoo->burst_time;
+        else
+                timing = voodoo->write_time;
+        cycles -= timing;
+        cycles_lost += timing;
+        voodoo->last_write_addr = addr;
+
+//        /*if (val) */pclog("write_linear_l: addr=%08x val=%08x  %08x\n", addr, val, voodoo->tile_base);
+        addr &= svga->decode_mask;
+        if (addr >= voodoo->tile_base)
+        {
+                int x, y;
+
+                addr -= voodoo->tile_base;
+                x = addr & (voodoo->tile_stride-1); /* column within the linear row */
+                y = addr >> voodoo->tile_stride_shift; /* linear row number */
+                
+                addr = voodoo->tile_base + (x & 127) + ((x >> 7) * 128*32) + ((y & 31) * 128) + (y >> 5)*voodoo->tile_x_real; /* rebuild from 128-byte x 32-row tiles */
+//                pclog("  Tile %08x->%08x->%08x->%08x %i %i  tile_x=%i\n", old_addr, addr_off, addr2, addr, x, y, voodoo->tile_x_real);
+        }
+
+        if (addr >= svga->vram_max)
+                return;
+
+        egawrites += 4;
+
+        cycles -= video_timing_write_l;
+        cycles_lost += video_timing_write_l;
+
+        svga->changedvram[addr >> 12] = changeframecount;
+        *(uint32_t *)&svga->vram[addr & svga->vram_mask] = val;
+        if (voodoo->cmdfifo_enabled && addr >= voodoo->cmdfifo_base && addr < voodoo->cmdfifo_end)
+        {
+//                pclog("CMDFIFO write %08x %08x  old amin=%08x amax=%08x hlcnt=%i depth_wr=%i rp=%08x\n", addr, val, voodoo->cmdfifo_amin, voodoo->cmdfifo_amax, voodoo->cmdfifo_holecount, voodoo->cmdfifo_depth_wr, voodoo->cmdfifo_rp);
+                if (addr == voodoo->cmdfifo_base && !voodoo->cmdfifo_holecount)
+                {
+//                        if (voodoo->cmdfifo_holecount)
+//                                fatal("CMDFIFO reset pointers while outstanding holes\n");
+                        /*Reset pointers: a write to the first FIFO word with no holes outstanding restarts amin/amax at the base*/
+                        voodoo->cmdfifo_amin = voodoo->cmdfifo_base;
+                        voodoo->cmdfifo_amax = voodoo->cmdfifo_base;
+                        voodoo->cmdfifo_depth_wr++;
+                        voodoo_wake_fifo_thread(voodoo);
+                }
+                else if (voodoo->cmdfifo_holecount)
+                {
+//                        if ((addr <= voodoo->cmdfifo_amin && voodoo->cmdfifo_amin != -4) || addr >= voodoo->cmdfifo_amax)
+//                                fatal("CMDFIFO holecount write outside of amin/amax - amin=%08x amax=%08x holecount=%i\n", voodoo->cmdfifo_amin, voodoo->cmdfifo_amax, voodoo->cmdfifo_holecount);
+//                        pclog("holecount %i\n", voodoo->cmdfifo_holecount);
+                        voodoo->cmdfifo_holecount--; /* any write while holes are outstanding is counted as filling one */
+                        if (!voodoo->cmdfifo_holecount)
+                        {
+                                /*Filled in holes, resume normal operation*/
+                                voodoo->cmdfifo_depth_wr += ((voodoo->cmdfifo_amax - voodoo->cmdfifo_amin) >> 2); /* release every dword between amin and amax at once */
+                                voodoo->cmdfifo_amin = voodoo->cmdfifo_amax;
+                                voodoo_wake_fifo_thread(voodoo);
+//                                pclog("hole filled! amin=%08x amax=%08x added %i words\n", voodoo->cmdfifo_amin, voodoo->cmdfifo_amax, words_to_add);
+                        }
+                }
+                else if (addr == voodoo->cmdfifo_amax+4)
+                {
+                        /*In-order write*/
+                        voodoo->cmdfifo_amin = addr;
+                        voodoo->cmdfifo_amax = addr;
+                        voodoo->cmdfifo_depth_wr++;
+                        voodoo_wake_fifo_thread(voodoo);
+                }
+                else
+                {
+                        /*Out-of-order write*/
+                        if (addr < voodoo->cmdfifo_amin)
+                        {
+                                /*Reset back to start. Note that write is still out of order!*/
+                                voodoo->cmdfifo_amin = voodoo->cmdfifo_base-4;
+
+                        }
+//                        else if (addr < voodoo->cmdfifo_amax)
+//                                fatal("Out-of-order write really out of order\n");
+                        voodoo->cmdfifo_amax = addr;
+                        voodoo->cmdfifo_holecount = ((voodoo->cmdfifo_amax - voodoo->cmdfifo_amin) >> 2) - 1; /* dwords strictly between amin and the new amax are the holes */
+//                        pclog("CMDFIFO out of order: amin=%08x amax=%08x holecount=%i\n", voodoo->cmdfifo_amin, voodoo->cmdfifo_amax, voodoo->cmdfifo_holecount);
+                }
+        }
+}
+
+/*
+ * Draw one scanline of the hardware cursor into buffer32.
+ *
+ *   svga     - SVGA state; svga->p is the owning banshee_t.
+ *   displine - index of the buffer32 line to draw into.
+ *
+ * Each cursor line is 16 bytes of VRAM at svga->hwcursor_latch.addr: 8 bytes
+ * of plane 0 followed by 8 bytes of plane 1, most significant bit leftmost
+ * (64 pixels). The latch address is advanced by 16 afterwards. A group of
+ * 8 pixels is skipped entirely when its starting x is not greater than 32-8.
+ *
+ * With VIDPROCCFG_CURSOR_MODE set (X11 mode) a pixel is drawn only where
+ * plane 0 is set, using plane 1 to choose hwCurC1/hwCurC0. Otherwise
+ * (Windows mode) a pixel is drawn where plane 0 is clear, again using plane 1
+ * to choose the colour, and where both planes are set the existing pixel is
+ * XORed with 0xffffff.
+ */
+void banshee_hwcursor_draw(svga_t *svga, int displine)
+{
+        banshee_t *banshee = (banshee_t *)svga->p;
+        uint32_t colour0 = banshee->hwCurC0;
+        uint32_t colour1 = banshee->hwCurC1;
+        uint8_t pat0[8], pat1[8];
+        int group, bit;
+        int px;
+        int x11_mode;
+
+        for (group = 0; group < 8; group++)
+        {
+                pat0[group] = svga->vram[svga->hwcursor_latch.addr + group];
+                pat1[group] = svga->vram[svga->hwcursor_latch.addr + group + 8];
+        }
+        svga->hwcursor_latch.addr += 16;
+
+        px = svga->hwcursor_latch.x;
+        x11_mode = (banshee->vidProcCfg & VIDPROCCFG_CURSOR_MODE) != 0;
+
+        for (group = 0; group < 8; group++, px += 8)
+        {
+                if (px <= (32-8))
+                        continue;
+
+                for (bit = 0; bit < 8; bit++)
+                {
+                        int mask = 0x80 >> bit; /*MSB is the leftmost pixel*/
+                        int p0 = (pat0[group] & mask) != 0;
+                        int p1 = (pat1[group] & mask) != 0;
+
+                        if (x11_mode)
+                        {
+                                /*X11 mode*/
+                                if (p0)
+                                        ((uint32_t *)buffer32->line[displine])[px + bit] = p1 ? colour1 : colour0;
+                        }
+                        else
+                        {
+                                /*Windows mode*/
+                                if (!p0)
+                                        ((uint32_t *)buffer32->line[displine])[px + bit] = p1 ? colour1 : colour0;
+                                else if (p1)
+                                        ((uint32_t *)buffer32->line[displine])[px + bit] ^= 0xffffff;
+                        }
+                }
+        }
+}
+/*
+ * Saturate an integer lvalue to the range 0..0xff, in place.
+ *
+ * If any bit outside the low 8 is set, a negative value becomes 0 and any
+ * other value becomes 0xff; values already in 0..0xff are left alone.
+ * The argument is evaluated more than once and is assigned to, so it must
+ * be a plain lvalue without side effects.
+ */
+#define CLAMP(x) do                                     \
+        {                                               \
+                if ((x) & ~0xff)                        \
+                        x = ((x) < 0) ? 0 : 0xff;       \
+        }                               \
+        while (0)
+/*
+ * Expand one line of linear 16-bit 5:6:5 overlay pixels into 32-bit words.
+ *
+ * buf - destination array of 32-bit pixels; written from index 0 upwards,
+ *       one entry per source pixel.
+ *
+ * Uses these names from the expansion site: voodoo (overlay.overlay_bytes
+ * gives the number of source bytes to consume), banshee (vidProcCfg), clut
+ * (lookup table of 32-bit colours) and src (byte pointer to the source
+ * data, advanced by 2 for every pixel decoded).
+ *
+ * The source word is read through a uint16_t cast, i.e. in host byte order.
+ * Its low 5 bits go to the local "r", the middle 6 bits to "g" and the top
+ * 5 bits to "b". With VIDPROCCFG_OVERLAY_CLUT_BYPASS set the fields are
+ * shifted straight into the low, middle and high byte of the output word
+ * (low-order bits of each byte stay zero); otherwise each field, scaled to
+ * an 8-bit index, looks up clut and contributes only its own byte lane.
+ * Note that the value named "r" ends up in the low byte of the output.
+ */
+#define DECODE_RGB565(buf)                                              \
+        do                                                              \
+        {                                                               \
+                int c;                                                  \
+                int wp = 0;                                             \
+                                                                        \
+                for (c = 0; c < voodoo->overlay.overlay_bytes; c += 2)  \
+                {                                                       \
+                        uint16_t data = *(uint16_t *)src;               \
+                        int r = data & 0x1f;                            \
+                        int g = (data >> 5) & 0x3f;                     \
+                        int b = data >> 11;                             \
+                                                                        \
+                        if (banshee->vidProcCfg & VIDPROCCFG_OVERLAY_CLUT_BYPASS) \
+                                buf[wp++] = (r << 3) | (g << 10) | (b << 19); \
+                        else                                            \
+                                buf[wp++] = (clut[r << 3] & 0x0000ff) | \
+                                            (clut[g << 2] & 0x00ff00) | \
+                                            (clut[b << 3] & 0xff0000);  \
+                        src += 2;                                       \
+                }                                                       \
+        } while (0)
+/*
+ * Tiled-memory variant of DECODE_RGB565: same pixel unpacking and CLUT
+ * handling, but the source is addressed inside svga->vram instead of being
+ * read through src (src is neither used nor advanced here).
+ *
+ * buf - destination array of 32-bit pixels. It also selects the source
+ *       line: when buf is banshee->overlay_buffer[1] the line at src_addr2
+ *       is decoded, for any other buf the line at src_addr.
+ *
+ * Uses these names from the expansion site: voodoo, banshee, svga, clut,
+ * src_addr and src_addr2.
+ *
+ * For byte offset c within the line, the data is fetched from
+ * base_addr + (c & 127) + (c >> 7)*128*32, wrapped with svga->vram_mask:
+ * every 128 bytes of line data the address jumps ahead by 128*32 bytes.
+ * NOTE(review): this looks like tiles 128 bytes wide by 32 rows tall -
+ * confirm against the hardware documentation.
+ */
+#define DECODE_RGB565_TILED(buf)                                        \
+        do                                                              \
+        {                                                               \
+                int c;                                                  \
+                int wp = 0;                                             \
+                uint32_t base_addr = (buf == banshee->overlay_buffer[1]) ? src_addr2 : src_addr;        \
+                                                                        \
+                for (c = 0; c < voodoo->overlay.overlay_bytes; c += 2) \
+                {                                                       \
+                        uint16_t data = *(uint16_t *)&svga->vram[(base_addr + (c & 127) + (c >> 7)*128*32) & svga->vram_mask];               \
+                        int r = data & 0x1f;                            \
+                        int g = (data >> 5) & 0x3f;                     \
+                        int b = data >> 11;                             \
+                                                                        \
+                        if (banshee->vidProcCfg & VIDPROCCFG_OVERLAY_CLUT_BYPASS) \
+                                buf[wp++] = (r << 3) | (g << 10) | (b << 19); \
+                        else                                            \
+                                buf[wp++] = (clut[r << 3] & 0x0000ff) | \
+                                            (clut[g << 2] & 0x00ff00) | \
+                                            (clut[b << 3] & 0xff0000);  \
+                }                                                       \
+        } while (0)
+/*
+ * Convert one line of packed 4:2:2 video, luma-first byte order, into
+ * 32-bit pixels.
+ *
+ * buf - destination array of 32-bit pixels; written from index 0 upwards,
+ *       two entries for every 4 source bytes.
+ *
+ * Uses voodoo (overlay.overlay_bytes = source bytes to consume) and src
+ * (byte pointer, advanced by 4 per pixel pair) from the expansion site.
+ *
+ * Each 4-byte group is read as: luma 1, chroma (stored in "Cr"), luma 2,
+ * chroma (stored in "Cb"). Chroma bytes are re-centred by subtracting 0x80
+ * and kept as signed 8-bit values. Both pixels of the pair share the same
+ * chroma offsets, computed in 8.8 fixed point:
+ *   dR = 359*Cr/256,  dG = (88*Cb + 183*Cr)/256,  dB = 453*Cb/256
+ * and each channel (luma + dR, luma - dG, luma + dB) is saturated to
+ * 0..255 with CLAMP before packing as r | g << 8 | b << 16.
+ *
+ * NOTE(review): byte 1 is named Cr and byte 3 Cb, and "r" is packed into
+ * the low byte of the output word - confirm this matches the channel
+ * order expected by the code that consumes buf.
+ */
+#define DECODE_YUYV422(buf)                                             \
+        do                                                              \
+        {                                                               \
+                int c;                                                  \
+                int wp = 0;                                             \
+                                                                        \
+                for (c = 0; c < voodoo->overlay.overlay_bytes; c += 4)  \
+                {                                                       \
+                        uint8_t y1, y2;                                 \
+                        int8_t Cr, Cb;                                  \
+                        int dR, dG, dB;                                 \
+                        int r, g, b;                                    \
+                                                                        \
+                        y1 = src[0];                                    \
+                        Cr = src[1] - 0x80;                             \
+                        y2 = src[2];                                    \
+                        Cb = src[3] - 0x80;                             \
+                        src += 4;                                       \
+                                                                        \
+                        dR = (359*Cr) >> 8;                             \
+                        dG = (88*Cb + 183*Cr) >> 8;                     \
+                        dB = (453*Cb) >> 8;                             \
+                                                                        \
+                        r = y1 + dR;                                    \
+                        CLAMP(r);                                       \
+                        g = y1 - dG;                                    \
+                        CLAMP(g);                                       \
+                        b = y1 + dB;                                    \
+                        CLAMP(b);                                       \
+                        buf[wp++] = r | (g << 8) | (b << 16); \
+                                                                        \
+                        r = y2 + dR;                                    \
+                        CLAMP(r);                                       \
+                        g = y2 - dG;                                    \
+                        CLAMP(g);                                       \
+                        b = y2 + dB;                                    \
+                        CLAMP(b);                                       \
+                        buf[wp++] = r | (g << 8) | (b << 16); \
+                }                                                       \
+        } while (0)
+/*
+ * Convert one line of packed 4:2:2 video, chroma-first byte order, into
+ * 32-bit pixels. This is the decoder OVERLAY_SAMPLE selects for
+ * OVERLAY_FMT_UYVY422 (note the macro name is spelled UYUV).
+ *
+ * buf - destination array of 32-bit pixels; written from index 0 upwards,
+ *       two entries for every 4 source bytes.
+ *
+ * Uses voodoo (overlay.overlay_bytes = source bytes to consume) and src
+ * (byte pointer, advanced by 4 per pixel pair) from the expansion site.
+ *
+ * Identical to DECODE_YUYV422 except for the byte order of each 4-byte
+ * group, which is read as: chroma (stored in "Cr"), luma 1, chroma (stored
+ * in "Cb"), luma 2. The chroma offsets, saturation and the
+ * r | g << 8 | b << 16 packing are the same.
+ *
+ * NOTE(review): byte 0 is named Cr and byte 2 Cb, and "r" is packed into
+ * the low byte of the output word - confirm this matches the channel
+ * order expected by the code that consumes buf.
+ */
+#define DECODE_UYUV422(buf)                                             \
+        do                                                              \
+        {                                                               \
+                int c;                                                  \
+                int wp = 0;                                             \
+                                                                        \
+                for (c = 0; c < voodoo->overlay.overlay_bytes; c += 4)  \
+                {                                                       \
+                        uint8_t y1, y2;                                 \
+                        int8_t Cr, Cb;                                  \
+                        int dR, dG, dB;                                 \
+                        int r, g, b;                                    \
+                                                                        \
+                        Cr = src[0] - 0x80;                             \
+                        y1 = src[1];                                    \
+                        Cb = src[2] - 0x80;                             \
+                        y2 = src[3];                                    \
+                        src += 4;                                       \
+                                                                        \
+                        dR = (359*Cr) >> 8;                             \
+                        dG = (88*Cb + 183*Cr) >> 8;                     \
+                        dB = (453*Cb) >> 8;                             \
+                                                                        \
+                        r = y1 + dR;                                    \
+                        CLAMP(r);                                       \
+                        g = y1 - dG;                                    \
+                        CLAMP(g);                                       \
+                        b = y1 + dB;                                    \
+                        CLAMP(b);                                       \
+                        buf[wp++] = r | (g << 8) | (b << 16); \
+                                                                        \
+                        r = y2 + dR;                                    \
+                        CLAMP(r);                                       \
+                        g = y2 - dG;                                    \
+                        CLAMP(g);                                       \
+                        b = y2 + dB;                                    \
+                        CLAMP(b);                                       \
+                        buf[wp++] = r | (g << 8) | (b << 16); \
+                }                                                       \
+        } while (0)
+
+/*
+ * Decode the current overlay source line into buf, choosing the decoder
+ * from banshee->overlay_pix_fmt.
+ *
+ * buf - destination array of 32-bit pixels, passed on to the decoder.
+ *
+ *   0                           : nothing is decoded
+ *   OVERLAY_FMT_YUYV422         : DECODE_YUYV422
+ *   OVERLAY_FMT_UYVY422         : DECODE_UYUV422
+ *   OVERLAY_FMT_565, 565_DITHER : DECODE_RGB565_TILED when
+ *                                 VIDPROCCFG_OVERLAY_TILE is set in
+ *                                 vidProcCfg, DECODE_RGB565 otherwise
+ *
+ * Any other format value matches no case and leaves buf unwritten. The
+ * tile flag is only consulted for the 5:6:5 formats. Every name the
+ * decoders need (banshee, voodoo, svga, src, clut, src_addr, src_addr2)
+ * must be in scope where this macro is expanded.
+ */
+#define OVERLAY_SAMPLE(buf)                     \
+        do                                      \
+        {                                       \
+                switch (banshee->overlay_pix_fmt)       \
+                {                                       \
+                        case 0:                         \
+                        break;                          \
+                                                        \
+                        case OVERLAY_FMT_YUYV422:       \
+                        DECODE_YUYV422(buf);            \
+                        break;                          \
+                                                        \
+                        case OVERLAY_FMT_UYVY422:       \
+                        DECODE_UYUV422(buf);            \
+                        break;                          \
+                                                        \
+                        case OVERLAY_FMT_565:           \
+                        case OVERLAY_FMT_565_DITHER:    \
+                        if (banshee->vidProcCfg & VIDPROCCFG_OVERLAY_TILE)      \
+                                DECODE_RGB565_TILED(buf);                       \
+                        else                                                    \
+                                DECODE_RGB565(buf);                             \
+                        break;                          \
+                }                                       \
+        } while (0)
+
+/* Generate both filter lookup tables (box and v1) plus voodoo->purpleline.
+ *
+ * voodoo - only voodoo->purpleline is written through this pointer.
+ * fcr    - difference threshold used for the "rb" tables.
+ * fcg    - difference threshold used for the "g" tables.
+ *
+ * The four vb_filter_* tables are defined outside this function and are
+ * indexed [g][h] with both indices running over 0..255. The summaries
+ * below describe the net effect of the statements as written and assume
+ * non-negative thresholds.
+ *
+ * Box prefilter (vb_filter_bx_rb / vb_filter_bx_g): the entry is h when h
+ * is greater than g by no more than the threshold, and g in every other
+ * case. avgdiff, clr and clg are computed but never reach the stored value.
+ *
+ * purpleline[g][0] and [g][2] are g + 4 limited to 255; purpleline[g][1]
+ * is g unchanged.
+ *
+ * v1 filter (vb_filter_v1_rb / vb_filter_v1_g): d = h - g is limited to
+ * +/- threshold; while d is still below the threshold the entry is
+ * g + d/2 (limited to 0..255), and once d has reached the threshold the
+ * entry is g.
+ */
+void voodoo_generate_vb_filters(voodoo_t *voodoo, int fcr, int fcg)
+{
+        int g, h;
+        float difference, diffg;
+        float thiscol, thiscolg;
+       float clr, clg = 0;
+       float hack = 1.0f;
+       // pre-clamping
+
+       fcr *= hack; /* scale factor is 1.0, so the thresholds keep their values */
+       fcg *= hack;
+
+
+       /* box prefilter */
+        for (g=0;g<256;g++)            // pixel 1 - our target pixel we want to bleed into
+        {
+               for (h=0;h<256;h++)      // pixel 2 - our main pixel
+               {
+                       float avg;
+                       float avgdiff;
+
+                       difference = (float)(g - h);
+                       avg = g;
+                       avgdiff = avg - h;
+
+                       avgdiff = avgdiff * 0.75f;
+                       if (avgdiff < 0) avgdiff *= -1;
+                       if (difference < 0) difference *= -1; /* difference is now |g - h| */
+
+                       thiscol = thiscolg = g;
+
+                       if (h > g)
+                       {
+                               clr = clg = avgdiff; /* clr/clg are limited below but not used afterwards */
+
+                               if (clr>fcr) clr=fcr;
+                                if (clg>fcg) clg=fcg;
+
+                               thiscol = g;
+                               thiscolg = g;
+
+                               if (thiscol>g+fcr)
+                                       thiscol=g+fcr;
+                               if (thiscolg>g+fcg)
+                                       thiscolg=g+fcg;
+
+                               if (thiscol>g+difference)
+                                       thiscol=g+difference;
+                               if (thiscolg>g+difference)
+                                       thiscolg=g+difference;
+
+                               // hmm this might not be working out..
+                               int ugh = g - h; /* negative here, because h > g in this branch */
+                               if (ugh < fcr)
+                                       thiscol = h;
+                               if (ugh < fcg)
+                                       thiscolg = h;
+                       }
+
+                       if (difference > fcr) /* too far apart: fall back to g, overriding the branch above */
+                               thiscol = g;
+                       if (difference > fcg)
+                               thiscolg = g;
+
+                       // clamp
+                       if (thiscol < 0) thiscol = 0;
+                       if (thiscolg < 0) thiscolg = 0;
+
+                       if (thiscol > 255) thiscol = 255;
+                       if (thiscolg > 255) thiscolg = 255;
+
+                       vb_filter_bx_rb[g][h] = (thiscol);
+                       vb_filter_bx_g [g][h] = (thiscolg);
+
+                }
+               float lined = g + 4; /* entries 0 and 2 get g + 4, limited to 255 */
+                if (lined > 255)
+                        lined = 255;
+                voodoo->purpleline[g][0] = lined;
+                voodoo->purpleline[g][2] = lined;
+
+                lined = g + 0; /* entry 1 keeps g as it is */
+                if (lined > 255)
+                        lined = 255;
+                voodoo->purpleline[g][1] = lined;
+        }
+
+        /* 4x1 and 2x2 filter */
+       //fcr *= 5;
+       //fcg *= 6;
+
+        for (g=0;g<256;g++)         // pixel 1
+        {
+                for (h=0;h<256;h++)      // pixel 2
+                {
+                        difference = (float)(h - g);
+                        diffg = difference;
+
+                       thiscol = thiscolg =  g;
+
+                        if (difference > fcr) /* limit the difference to +/- fcr */
+                                difference = fcr;
+                        if (difference < -fcr)
+                                difference = -fcr;
+
+                        if (diffg > fcg) /* limit the difference to +/- fcg */
+                                diffg = fcg;
+                        if (diffg < -fcg)
+                                diffg = -fcg;
+
+                       if ((difference < fcr) || (-difference > -fcr)) /* both tests mean difference < fcr */
+                                thiscol =  g + (difference / 2);
+                       if ((diffg < fcg) || (-diffg > -fcg)) /* both tests mean diffg < fcg */
+                                thiscolg =  g + (diffg / 2);
+
+                        if (thiscol < 0)
+                                thiscol = 0;
+                        if (thiscol > 255)
+                                thiscol = 255;
+
+                        if (thiscolg < 0)
+                                thiscolg = 0;
+                        if (thiscolg > 255)
+                                thiscolg = 255;
+
+                        vb_filter_v1_rb[g][h] = thiscol;
+                        vb_filter_v1_g [g][h] = thiscolg;
+
+                }
+        }
+
+}
+
+
+static void banshee_overlay_draw(svga_t *svga, int displine)
+{
+        banshee_t *banshee = (banshee_t *)svga->p;
+        voodoo_t *voodoo = banshee->voodoo;
+        uint32_t *p;
+        int x;
+        int y = voodoo->overlay.src_y >> 20;
+        uint32_t src_addr = svga->overlay_latch.addr + ((banshee->vidProcCfg & VIDPROCCFG_OVERLAY_TILE) ?
+                ((y & 31) * 128 + (y >> 5) * svga->overlay_latch.pitch) :
+                y * svga->overlay_latch.pitch);
+        uint32_t src_addr2 = svga->overlay_latch.addr + ((banshee->vidProcCfg & VIDPROCCFG_OVERLAY_TILE) ?
+                (((y + 1) & 31) * 128 + ((y + 1) >> 5) * svga->overlay_latch.pitch) :
+                (y + 1) * svga->overlay_latch.pitch);
+        uint8_t *src = &svga->vram[src_addr & svga->vram_mask];
+        uint32_t src_x = 0;
+        unsigned int y_coeff = (voodoo->overlay.src_y & 0xfffff) >> 4;
+        int skip_filtering;
+        uint32_t *clut = &svga->pallook[(banshee->vidProcCfg & VIDPROCCFG_OVERLAY_CLUT_SEL) ? 256 : 0];
+
+        if (svga->render == svga_render_null &&
+                        !svga->changedvram[src_addr >> 12] && !svga->changedvram[src_addr2 >> 12] &&
+                        !svga->fullchange &&
+                        ((voodoo->overlay.src_y >> 20) < 2048 && !voodoo->dirty_line[voodoo->overlay.src_y >> 20]) &&
+                        !(banshee->vidProcCfg & VIDPROCCFG_V_SCALE_ENABLE))
+        {
+                voodoo->overlay.src_y += (1 << 20);
+                return;
+        }
+
+        if ((voodoo->overlay.src_y >> 20) < 2048)
+                voodoo->dirty_line[voodoo->overlay.src_y >> 20] = 0;
+//        pclog("displine=%i addr=%08x %08x  %08x  %08x\n", displine, svga->overlay_latch.addr, src_addr, voodoo->overlay.vidOverlayDvdy, *(uint32_t *)src);
+//        if (src_addr >= 0x800000)
+//                fatal("overlay out of range!\n");
+        p = &((uint32_t *)buffer32->line[displine])[svga->overlay_latch.x + 32];
+
+        if (banshee->voodoo->scrfilter && banshee->voodoo->scrfilterEnabled)
+                skip_filtering = ((banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK) != VIDPROCCFG_FILTER_MODE_BILINEAR &&
+                            !(banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE) && !(banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_DITHER_4X4) &&
+                            !(banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_DITHER_2X2));
+        else
+                skip_filtering = ((banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK) != VIDPROCCFG_FILTER_MODE_BILINEAR &&
+                                !(banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE));
+
+        if (skip_filtering)
+        {
+                /*No scaling or filtering required, just write straight to output buffer*/
+                OVERLAY_SAMPLE(p);
+        }
+        else
+        {
+                OVERLAY_SAMPLE(banshee->overlay_buffer[0]);
+
+                switch (banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK)
+                {
+                        case VIDPROCCFG_FILTER_MODE_BILINEAR:
+                        src = &svga->vram[src_addr2 & svga->vram_mask];
+                        OVERLAY_SAMPLE(banshee->overlay_buffer[1]);
+                        if (banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE)
+                        {
+                                for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                {
+                                        unsigned int x_coeff = (src_x & 0xfffff) >> 4;
+                                        unsigned int coeffs[4] = {
+                                                ((0x10000 - x_coeff) * (0x10000 - y_coeff)) >> 16,
+                                                (           x_coeff  * (0x10000 - y_coeff)) >> 16,
+                                                ((0x10000 - x_coeff) *            y_coeff) >> 16,
+                                                (           x_coeff  *            y_coeff) >> 16
+                                        };
+                                        uint32_t samp0 = banshee->overlay_buffer[0][src_x >> 20];
+                                        uint32_t samp1 = banshee->overlay_buffer[0][(src_x >> 20) + 1];
+                                        uint32_t samp2 = banshee->overlay_buffer[1][src_x >> 20];
+                                        uint32_t samp3 = banshee->overlay_buffer[1][(src_x >> 20) + 1];
+                                        int r = (((samp0 >> 16) & 0xff) * coeffs[0] +
+                                                ((samp1 >> 16) & 0xff) * coeffs[1] +
+                                                ((samp2 >> 16) & 0xff) * coeffs[2] +
+                                                ((samp3 >> 16) & 0xff) * coeffs[3]) >> 16;
+                                        int g = (((samp0 >> 8) & 0xff) * coeffs[0] +
+                                                ((samp1 >> 8) & 0xff) * coeffs[1] +
+                                                ((samp2 >> 8) & 0xff) * coeffs[2] +
+                                                ((samp3 >> 8) & 0xff) * coeffs[3]) >> 16;
+                                        int b = ((samp0 & 0xff) * coeffs[0] +
+                                                (samp1 & 0xff) * coeffs[1] +
+                                                (samp2 & 0xff) * coeffs[2] +
+                                                (samp3 & 0xff) * coeffs[3]) >> 16;
+                                        p[x] = (r << 16) | (g << 8) | b;
+
+                                        src_x += voodoo->overlay.vidOverlayDudx;
+                                }
+                        }
+                        else
+                        {
+                                for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                {
+                                        uint32_t samp0 = banshee->overlay_buffer[0][src_x >> 20];
+                                        uint32_t samp1 = banshee->overlay_buffer[1][src_x >> 20];
+                                        int r = (((samp0 >> 16) & 0xff) * (0x10000 - y_coeff) +
+                                                ((samp1 >> 16) & 0xff) * y_coeff) >> 16;
+                                        int g = (((samp0 >> 8) & 0xff) * (0x10000 - y_coeff) +
+                                                ((samp1 >> 8) & 0xff) * y_coeff) >> 16;
+                                        int b = ((samp0 & 0xff) * (0x10000 - y_coeff) +
+                                                (samp1 & 0xff) * y_coeff) >> 16;
+                                        p[x] = (r << 16) | (g << 8) | b;
+                                }
+                        }
+                        break;
+                
+                        case VIDPROCCFG_FILTER_MODE_DITHER_4X4:
+                        if (banshee->voodoo->scrfilter && banshee->voodoo->scrfilterEnabled)
+                        {
+                                uint8_t fil[(svga->overlay_latch.xsize) * 3];
+                                uint8_t fil3[(svga->overlay_latch.xsize) * 3];
+
+                                if (banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE) /* leilei HACK - don't know of real 4x1 hscaled behavior yet, double for now */
+                                {
+                                        for (x=0; x<svga->overlay_latch.xsize;x++)
+                                        {
+                                                fil[x*3]       = ((banshee->overlay_buffer[0][src_x >> 20]));
+                                                fil[x*3+1]     = ((banshee->overlay_buffer[0][src_x >> 20] >> 8));
+                                                fil[x*3+2]     = ((banshee->overlay_buffer[0][src_x >> 20] >> 16));
+                                                fil3[x*3+0]    = fil[x*3+0];
+                                                fil3[x*3+1]    = fil[x*3+1];
+                                                fil3[x*3+2]    = fil[x*3+2];
+                                                src_x += voodoo->overlay.vidOverlayDudx;
+                                        }
+                                }
+                                else
+                                {
+                                        for (x=0; x<svga->overlay_latch.xsize;x++)
+                                        {
+                                                fil[x*3]       = ((banshee->overlay_buffer[0][x]));
+                                                fil[x*3+1]     = ((banshee->overlay_buffer[0][x] >> 8));
+                                                fil[x*3+2]     = ((banshee->overlay_buffer[0][x] >> 16));
+                                                fil3[x*3+0]    = fil[x*3+0];
+                                                fil3[x*3+1]    = fil[x*3+1];
+                                                fil3[x*3+2]    = fil[x*3+2];
+                                        }
+                                }
+                                if (y % 2 == 0)
+                                {
+                                        for (x=0; x<svga->overlay_latch.xsize;x++)
+                                        {
+                                                fil[x*3] = banshee->voodoo->purpleline[fil[x*3+0]][0];
+                                                fil[x*3+1] = banshee->voodoo->purpleline[fil[x*3+1]][1];
+                                                fil[x*3+2] = banshee->voodoo->purpleline[fil[x*3+2]][2];
+                                        }
+                                }
+
+                                for (x=1; x<svga->overlay_latch.xsize;x++)
+                                {
+                                        fil3[(x)*3]   = vb_filter_v1_rb [fil[x*3]]  [fil[(x-1) *3]];
+                                        fil3[(x)*3+1] = vb_filter_v1_g  [fil[x*3+1]][fil[(x-1) *3+1]];
+                                        fil3[(x)*3+2] = vb_filter_v1_rb [fil[x*3+2]] [fil[(x-1) *3+2]];
+                                }
+                                for (x=1; x<svga->overlay_latch.xsize;x++)
+                                {
+                                        fil[(x)*3]   = vb_filter_v1_rb [fil[x*3]]  [fil3[(x-1) *3]];
+                                        fil[(x)*3+1] = vb_filter_v1_g  [fil[x*3+1]][fil3[(x-1) *3+1]];
+                                        fil[(x)*3+2] = vb_filter_v1_rb [fil[x*3+2]] [fil3[(x-1) *3+2]];
+                                }
+                                for (x=1; x<svga->overlay_latch.xsize;x++)
+                                {
+                                        fil3[(x)*3]   = vb_filter_v1_rb [fil[x*3]]  [fil[(x-1) *3]];
+                                        fil3[(x)*3+1] = vb_filter_v1_g  [fil[x*3+1]][fil[(x-1) *3+1]];
+                                        fil3[(x)*3+2] = vb_filter_v1_rb [fil[x*3+2]] [fil[(x-1) *3+2]];
+                                }
+                                for (x=0; x<svga->overlay_latch.xsize;x++)
+                                {
+                                        fil[(x)*3]   = vb_filter_v1_rb [fil[x*3]]  [fil3[(x+1) *3]];
+                                        fil[(x)*3+1] = vb_filter_v1_g  [fil[x*3+1]][fil3[(x+1) *3+1]];
+                                        fil[(x)*3+2] = vb_filter_v1_rb [fil[x*3+2]] [fil3[(x+1) *3+2]];
+                                        p[x] = (fil[x*3+2] << 16) | (fil[x*3+1] << 8) | fil[x*3];
+                                }
+                        }
+                        else  /* filter disabled by emulator option */
+                        {
+                                if (banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE)
+                                {
+                                        for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                        {
+                                                p[x] = banshee->overlay_buffer[0][src_x >> 20];
+                                                src_x += voodoo->overlay.vidOverlayDudx;
+                                        }
+                                }
+                                else
+                                {
+                                        for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                                p[x] = banshee->overlay_buffer[0][x];
+                                }
+                        }
+                        break;
+
+                        case VIDPROCCFG_FILTER_MODE_DITHER_2X2:
+                        if (banshee->voodoo->scrfilter && banshee->voodoo->scrfilterEnabled)
+                        {
+                                uint8_t fil[(svga->overlay_latch.xsize) * 3];
+                                uint8_t soak[(svga->overlay_latch.xsize) * 3];
+                                uint8_t soak2[(svga->overlay_latch.xsize) * 3];
+
+                                uint8_t samp1[(svga->overlay_latch.xsize) * 3];
+                                uint8_t samp2[(svga->overlay_latch.xsize) * 3];
+                                uint8_t samp3[(svga->overlay_latch.xsize) * 3];
+                                uint8_t samp4[(svga->overlay_latch.xsize) * 3];
+
+                                src = &svga->vram[src_addr2 & svga->vram_mask];
+                                OVERLAY_SAMPLE(banshee->overlay_buffer[1]);
+                                for (x=0; x<svga->overlay_latch.xsize;x++)
+                                {
+                                        samp1[x*3]     = ((banshee->overlay_buffer[0][x]));
+                                        samp1[x*3+1]   = ((banshee->overlay_buffer[0][x] >> 8));
+                                        samp1[x*3+2]   = ((banshee->overlay_buffer[0][x] >> 16));
+
+                                        samp2[x*3+0]   = ((banshee->overlay_buffer[0][x+1]));
+                                        samp2[x*3+1]   = ((banshee->overlay_buffer[0][x+1] >> 8));
+                                        samp2[x*3+2]   = ((banshee->overlay_buffer[0][x+1] >> 16));
+
+                                        samp3[x*3+0]   = ((banshee->overlay_buffer[1][x]));
+                                        samp3[x*3+1]   = ((banshee->overlay_buffer[1][x] >> 8));
+                                        samp3[x*3+2]   = ((banshee->overlay_buffer[1][x] >> 16));
+
+                                        samp4[x*3+0]   = ((banshee->overlay_buffer[1][x+1]));
+                                        samp4[x*3+1]   = ((banshee->overlay_buffer[1][x+1] >> 8));
+                                        samp4[x*3+2]   = ((banshee->overlay_buffer[1][x+1] >> 16));
+
+                                        /* sample two lines */
+
+                                        soak[x*3+0]   = vb_filter_bx_rb [samp1[x*3+0]]   [samp2[x*3+0]];
+                                        soak[x*3+1]   = vb_filter_bx_g  [samp1[x*3+1]]   [samp2[x*3+1]];
+                                        soak[x*3+2]   = vb_filter_bx_rb [samp1[x*3+2]]   [samp2[x*3+2]];
+
+                                        soak2[x*3+0]   = vb_filter_bx_rb[samp3[x*3+0]]   [samp4[x*3+0]];
+                                        soak2[x*3+1]   = vb_filter_bx_g [samp3[x*3+1]]   [samp4[x*3+1]];
+                                        soak2[x*3+2]   = vb_filter_bx_rb[samp3[x*3+2]]   [samp4[x*3+2]];
+
+                                        /* then pour it on the rest */
+
+                                        fil[x*3+0]   = vb_filter_v1_rb[soak[x*3+0]]   [soak2[x*3+0]];
+                                        fil[x*3+1]   = vb_filter_v1_g [soak[x*3+1]]   [soak2[x*3+1]];
+                                        fil[x*3+2]   = vb_filter_v1_rb[soak[x*3+2]]   [soak2[x*3+2]];
+                                }
+
+                                if (banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE)  /* 2x2 on a scaled low res */
+                                {
+                                        for (x=0; x<svga->overlay_latch.xsize;x++)
+                                        {
+                                                p[x] = (fil[(src_x >> 20)*3+2] << 16) | (fil[(src_x >> 20)*3+1] << 8) | fil[(src_x >> 20)*3];
+                                                src_x += voodoo->overlay.vidOverlayDudx;
+                                        }
+                                }
+                                else
+                                {
+                                        for (x=0; x<svga->overlay_latch.xsize;x++)
+                                        {
+                                                p[x] = (fil[x*3+2] << 16) | (fil[x*3+1] << 8) | fil[x*3];
+                                        }
+                                }
+                        }
+                        else  /* filter disabled by emulator option */
+                        {
+                                if (banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE)
+                                {
+                                        for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                        {
+                                                p[x] = banshee->overlay_buffer[0][src_x >> 20];
+
+                                                src_x += voodoo->overlay.vidOverlayDudx;
+                                        }
+                                }
+                                else
+                                {
+                                        for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                                p[x] = banshee->overlay_buffer[0][x];
+                                }
+                       }
+                        break;
+
+                        case VIDPROCCFG_FILTER_MODE_POINT:
+                        default:
+                        if (banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE)
+                        {
+                                for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                {
+                                        p[x] = banshee->overlay_buffer[0][src_x >> 20];
+
+                                        src_x += voodoo->overlay.vidOverlayDudx;
+                                }
+                        }
+                        else
+                        {
+                                for (x = 0; x < svga->overlay_latch.xsize; x++)
+                                        p[x] = banshee->overlay_buffer[0][x];
+                        }
+                        break;
+                }
+        }
+        
+        if (banshee->vidProcCfg & VIDPROCCFG_V_SCALE_ENABLE)
+                voodoo->overlay.src_y += voodoo->overlay.vidOverlayDvdy;
+        else
+                voodoo->overlay.src_y += (1 << 20);
+}
+
+/*Load the SVGA overlay address (live and latched copy) from the Voodoo
+  core's leftOverlayBuf, masked to 28 bits, and set every dirty_line entry.
+  p    - banshee_t instance
+  addr - accepted for the callers' benefit but not read here; the address
+         always comes from voodoo->leftOverlayBuf*/
+void banshee_set_overlay_addr(void *p, uint32_t addr)
+{
+        banshee_t *dev = (banshee_t *)p;
+        voodoo_t *core = dev->voodoo;
+
+        dev->svga.overlay.addr = core->leftOverlayBuf & 0xfffffff;
+        dev->svga.overlay_latch.addr = core->leftOverlayBuf & 0xfffffff;
+
+        memset(core->dirty_line, 1, sizeof(core->dirty_line));
+}
+
+/*Vertical-sync hook (installed as svga.vsync_callback by the init code).
+  Counts the retrace, completes a pending buffer swap once more than
+  swap_interval retraces have elapsed, then rewinds the per-frame overlay
+  and desktop scan state.*/
+static void banshee_vsync_callback(svga_t *svga)
+{
+        banshee_t *dev = (banshee_t *)svga->p;
+        voodoo_t *core = dev->voodoo;
+        int swap_now;
+
+        core->retrace_count++;
+
+        /*The swap bookkeeping is updated under swap_mutex; everything after
+          the unlock runs without the lock, exactly as before*/
+        thread_lock_mutex(core->swap_mutex);
+        swap_now = core->swap_pending && (core->retrace_count > core->swap_interval);
+        if (swap_now)
+        {
+                if (core->swap_count > 0)
+                        core->swap_count--;
+                core->swap_pending = 0;
+        }
+        thread_unlock_mutex(core->swap_mutex);
+
+        if (swap_now)
+        {
+                memset(core->dirty_line, 1, sizeof(core->dirty_line));
+                core->retrace_count = 0;
+                banshee_set_overlay_addr(dev, core->swap_offset);
+                thread_set_event(core->wake_fifo_thread);
+                core->frame_count++;
+        }
+
+        /*Start the next frame from the top*/
+        core->overlay.src_y = 0;
+        dev->desktop_addr = dev->vidDesktopStartAddr;
+        dev->desktop_y = 0;
+}
+
+static uint8_t banshee_pci_read(int func, int addr, void *p) /*PCI configuration-space byte read handler.
+          func - PCI function number; anything other than 0 reads back 0xff
+          addr - configuration register offset
+          p    - banshee_t instance
+          Returns the register byte; offsets without a case below read as 0.*/
+{
+        banshee_t *banshee = (banshee_t *)p;
+//        svga_t *svga = &banshee->svga;
+        uint8_t ret = 0;
+
+        if (func)
+                return 0xff;
+//        pclog("Banshee PCI read %08X  ", addr);
+        switch (addr)
+        {
+                case 0x00: ret = 0x1a; break; /*3DFX*/
+                case 0x01: ret = 0x12; break;
+                
+                case 0x02: ret = (banshee->type == TYPE_BANSHEE) ? 0x03 : 0x05; break; /*0x03 for TYPE_BANSHEE, 0x05 for every other card type*/
+                case 0x03: ret = 0x00; break;
+
+                case 0x04: ret = banshee->pci_regs[0x04] & 0x27; break; /*same 0x27 mask that banshee_pci_write applies when storing*/
+                
+                case 0x07: ret = banshee->pci_regs[0x07] & 0x36; break; /*NOTE(review): write side stores with mask 0x3e, read masks with 0x36 - confirm intended*/
+                                
+                case 0x08: ret = (banshee->type == TYPE_BANSHEE) ? 3 : 1; break; /*Revision ID*/
+                case 0x09: ret = 0; break; /*Programming interface*/
+                
+                case 0x0a: ret = 0x00; break; /*Supports VGA interface*/
+                case 0x0b: ret = 0x03; /*output = 3; */break;
+
+                case 0x0d: ret = banshee->pci_regs[0x0d] & 0xf8; break;
+                                
+                case 0x10: ret = 0x00; break; /*memBaseAddr0*/
+                case 0x11: ret = 0x00; break;
+                case 0x12: ret = 0x00; break;
+                case 0x13: ret = banshee->memBaseAddr0 >> 24; break; /*only the top byte of the base is exposed*/
+
+                case 0x14: ret = 0x00; break; /*memBaseAddr1*/
+                case 0x15: ret = 0x00; break;
+                case 0x16: ret = 0x00; break;
+                case 0x17: ret = banshee->memBaseAddr1 >> 24; break; /*only the top byte of the base is exposed*/
+                
+                case 0x18: ret = 0x01; break; /*ioBaseAddr*/
+                case 0x19: ret = banshee->ioBaseAddr >> 8; break; /*bits 15..8 of the I/O base; the other bytes are fixed*/
+                case 0x1a: ret = 0x00; break;
+                case 0x1b: ret = 0x00; break;
+
+                /*Subsystem vendor ID*/
+                case 0x2c: ret = banshee->pci_regs[0x2c]; break; /*0x2c-0x2f are filled in per card type by banshee_init_common*/
+                case 0x2d: ret = banshee->pci_regs[0x2d]; break;
+                case 0x2e: ret = banshee->pci_regs[0x2e]; break;
+                case 0x2f: ret = banshee->pci_regs[0x2f]; break;
+
+                case 0x30: ret = banshee->pci_regs[0x30] & 0x01; break; /*BIOS ROM address*/
+                case 0x31: ret = 0x00; break;
+                case 0x32: ret = banshee->pci_regs[0x32]; break;
+                case 0x33: ret = banshee->pci_regs[0x33]; break;
+
+                case 0x3c: ret = banshee->pci_regs[0x3c]; break; /*plain read-back of the byte stored by banshee_pci_write*/
+                                
+                case 0x3d: ret = 0x01; break; /*INTA*/
+                
+                case 0x3e: ret = 0x04; break;
+                case 0x3f: ret = 0xff; break;
+                
+        }
+//        pclog("%02X\n", ret);
+        return ret;
+}
+
+/*PCI configuration-space byte write handler.
+  func - PCI function number; writes to any function other than 0 are ignored
+  addr - configuration register offset
+  val  - byte being written
+  p    - banshee_t instance
+  Offsets without a case below are silently ignored.*/
+static void banshee_pci_write(int func, int addr, uint8_t val, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+
+        if (func)
+                return;
+
+        switch (addr)
+        {
+                /*Read-only registers: writes are dropped*/
+                case 0x00: case 0x01: case 0x02: case 0x03:
+                case 0x08: case 0x09: case 0x0a: case 0x0b:
+                case 0x3d: case 0x3e: case 0x3f:
+                return;
+
+                case PCI_REG_COMMAND:
+                if (val & PCI_COMMAND_IO)
+                {
+                        /*Remove first so the handlers are never registered twice*/
+                        io_removehandler(0x03c0, 0x0020, banshee_in, NULL, NULL, banshee_out, NULL, NULL, banshee);
+                        if (banshee->ioBaseAddr)
+                                io_removehandler(banshee->ioBaseAddr, 0x0100, banshee_ext_in, NULL, banshee_ext_inl, banshee_ext_out, NULL, banshee_ext_outl, banshee);
+
+                        io_sethandler(0x03c0, 0x0020, banshee_in, NULL, NULL, banshee_out, NULL, NULL, banshee);
+                        if (banshee->ioBaseAddr)
+                                io_sethandler(banshee->ioBaseAddr, 0x0100, banshee_ext_in, NULL, banshee_ext_inl, banshee_ext_out, NULL, banshee_ext_outl, banshee);
+                }
+                else
+                {
+                        io_removehandler(0x03c0, 0x0020, banshee_in, NULL, NULL, banshee_out, NULL, NULL, banshee);
+                        io_removehandler(banshee->ioBaseAddr, 0x0100, banshee_ext_in, NULL, banshee_ext_inl, banshee_ext_out, NULL, banshee_ext_outl, banshee);
+                }
+                banshee->pci_regs[PCI_REG_COMMAND] = val & 0x27;
+                banshee_updatemapping(banshee);
+                return;
+
+                case 0x07:
+                banshee->pci_regs[0x07] = val & 0x3e;
+                return;
+
+                case 0x0d:
+                banshee->pci_regs[0x0d] = val & 0xf8;
+                return;
+
+                /*Only the top byte of each memory base is writable. The cast
+                  keeps the shift in unsigned arithmetic: val is promoted to
+                  int, and shifting 0x80..0xfe left by 24 would overflow it.*/
+                case 0x13:
+                banshee->memBaseAddr0 = (uint32_t)(val & 0xfe) << 24;
+                banshee_updatemapping(banshee);
+                return;
+
+                case 0x17:
+                banshee->memBaseAddr1 = (uint32_t)(val & 0xfe) << 24;
+                banshee_updatemapping(banshee);
+                return;
+
+                case 0x19:
+                /*Move the extended I/O window: unhook the old base, hook the new one*/
+                if (banshee->pci_regs[PCI_REG_COMMAND] & PCI_COMMAND_IO)
+                        io_removehandler(banshee->ioBaseAddr, 0x0100, banshee_ext_in, NULL, banshee_ext_inl, banshee_ext_out, NULL, banshee_ext_outl, banshee);
+                banshee->ioBaseAddr = val << 8;
+                if ((banshee->pci_regs[PCI_REG_COMMAND] & PCI_COMMAND_IO) && banshee->ioBaseAddr)
+                        io_sethandler(banshee->ioBaseAddr, 0x0100, banshee_ext_in, NULL, banshee_ext_inl, banshee_ext_out, NULL, banshee_ext_outl, banshee);
+                pclog("Banshee ioBaseAddr=%08x\n", banshee->ioBaseAddr);
+                return;
+
+                case 0x30: case 0x32: case 0x33:
+                banshee->pci_regs[addr] = val;
+                if (banshee->pci_regs[0x30] & 0x01)
+                {
+                        /*Distinct name so the register offset parameter is not
+                          shadowed; unsigned casts avoid shifting into the sign bit*/
+                        uint32_t rom_addr = ((uint32_t)banshee->pci_regs[0x32] << 16) |
+                                            ((uint32_t)banshee->pci_regs[0x33] << 24);
+                        pclog("Banshee bios_rom enabled at %08x\n", rom_addr);
+                        mem_mapping_set_addr(&banshee->bios_rom.mapping, rom_addr, 0x10000);
+                        mem_mapping_enable(&banshee->bios_rom.mapping);
+                }
+                else
+                {
+                        pclog("Banshee bios_rom disabled\n");
+                        mem_mapping_disable(&banshee->bios_rom.mapping);
+                }
+                return;
+
+                case 0x3c:
+                banshee->pci_regs[0x3c] = val;
+                return;
+
+                default:
+                return;
+        }
+}
+
+static device_config_t banshee_sgram_config[] = /*User-configurable options for the SGRAM card
+          (referenced by voodoo_banshee_device): memory size, bilinear filtering,
+          screen filter, render thread count and, unless NO_CODEGEN is defined,
+          the recompiler switch.*/
+{
+        {
+                .name = "memory", /*read back in banshee_init_common via device_get_config_int("memory")*/
+                .description = "Memory size",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "8 MB",
+                                .value = 8
+                        },
+                        {
+                                .description = "16 MB",
+                                .value = 16
+                        },
+                        {
+                                .description = "" /*empty description ends the selection list (presumably - confirm against device.h)*/
+                        }
+                },
+                .default_int = 16
+        },
+        {
+                .name = "bilinear",
+                .description = "Bilinear filtering",
+                .type = CONFIG_BINARY,
+                .default_int = 1
+        },
+        {
+                .name = "dacfilter",
+                .description = "Screen Filter",
+                .type = CONFIG_BINARY,
+                .default_int = 0
+        },
+        {
+                .name = "render_threads",
+                .description = "Render threads",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "1",
+                                .value = 1
+                        },
+                        {
+                                .description = "2",
+                                .value = 2
+                        },
+                        {
+                                .description = "4",
+                                .value = 4
+                        },
+                        {
+                                .description = ""
+                        }
+                },
+                .default_int = 2
+        },
+#ifndef NO_CODEGEN
+        {
+                .name = "recompiler",
+                .description = "Recompiler",
+                .type = CONFIG_BINARY,
+                .default_int = 1
+        },
+#endif
+        {
+                .type = -1 /*last entry; presumably the end-of-table marker - confirm against device.h*/
+        }
+};
+
+static device_config_t banshee_sdram_config[] = /*Options for the SDRAM cards (Creative Banshee,
+          Voodoo 3 2000/3000 descriptors). Same entries as banshee_sgram_config
+          minus "memory": banshee_init_common fixes SDRAM boards at 16 MB.*/
+{
+        {
+                .name = "bilinear",
+                .description = "Bilinear filtering",
+                .type = CONFIG_BINARY,
+                .default_int = 1
+        },
+        {
+                .name = "dacfilter",
+                .description = "Screen Filter",
+                .type = CONFIG_BINARY,
+                .default_int = 0
+        },
+        {
+                .name = "render_threads",
+                .description = "Render threads",
+                .type = CONFIG_SELECTION,
+                .selection =
+                {
+                        {
+                                .description = "1",
+                                .value = 1
+                        },
+                        {
+                                .description = "2",
+                                .value = 2
+                        },
+                        {
+                                .description = "4",
+                                .value = 4
+                        },
+                        {
+                                .description = "" /*empty description ends the selection list (presumably - confirm against device.h)*/
+                        }
+                },
+                .default_int = 2
+        },
+#ifndef NO_CODEGEN
+        {
+                .name = "recompiler",
+                .description = "Recompiler",
+                .type = CONFIG_BINARY,
+                .default_int = 1
+        },
+#endif
+        {
+                .type = -1 /*last entry; presumably the end-of-table marker - confirm against device.h*/
+        }
+};
+
+/*Shared constructor for every Banshee / Voodoo 3 variant.
+  fn          - BIOS ROM file name handed to rom_init
+  has_sgram   - non-zero for the SGRAM board (memory size comes from the
+                "memory" config option); zero for SDRAM boards (fixed 16 MB)
+  type        - TYPE_* card identity, selects PCI IDs
+  voodoo_type - VOODOO_* value passed to voodoo_2d3d_card_init
+  Returns the new banshee_t, or NULL if it could not be allocated.*/
+static void *banshee_init_common(char *fn, int has_sgram, int type, int voodoo_type)
+{
+        int mem_size;
+        /*calloc gives the zero-filled state the old malloc+memset pair did,
+          and the result is checked before first use*/
+        banshee_t *banshee = (banshee_t *)calloc(1, sizeof(*banshee));
+
+        if (!banshee)
+                return NULL;
+
+        banshee->type = type;
+
+        rom_init(&banshee->bios_rom, fn, 0xc0000, 0x10000, 0xffff, 0, MEM_MAPPING_EXTERNAL);
+        mem_mapping_disable(&banshee->bios_rom.mapping);
+
+        if (has_sgram)
+                mem_size = device_get_config_int("memory");
+        else
+                mem_size = 16; /*SDRAM Banshee only supports 16 MB*/
+
+        svga_init(&banshee->svga, banshee, mem_size << 20,
+                   banshee_recalctimings,
+                   banshee_in, banshee_out,
+                   banshee_hwcursor_draw,
+                   banshee_overlay_draw);
+        banshee->svga.vsync_callback = banshee_vsync_callback;
+
+        /*All three mappings start at address 0 with size 0*/
+        mem_mapping_add(&banshee->linear_mapping, 0, 0, banshee_read_linear,
+                                                        banshee_read_linear_w,
+                                                        banshee_read_linear_l,
+                                                        banshee_write_linear,
+                                                        banshee_write_linear_w,
+                                                        banshee_write_linear_l,
+                                                        NULL,
+                                                        MEM_MAPPING_EXTERNAL,
+                                                        &banshee->svga);
+        mem_mapping_add(&banshee->reg_mapping_low, 0, 0,banshee_reg_read,
+                                                        banshee_reg_readw,
+                                                        banshee_reg_readl,
+                                                        banshee_reg_write,
+                                                        banshee_reg_writew,
+                                                        banshee_reg_writel,
+                                                        NULL,
+                                                        MEM_MAPPING_EXTERNAL,
+                                                        banshee);
+        mem_mapping_add(&banshee->reg_mapping_high, 0,0,banshee_reg_read,
+                                                        banshee_reg_readw,
+                                                        banshee_reg_readl,
+                                                        banshee_reg_write,
+                                                        banshee_reg_writew,
+                                                        banshee_reg_writel,
+                                                        NULL,
+                                                        MEM_MAPPING_EXTERNAL,
+                                                        banshee);
+
+        /*No I/O handlers are installed here: banshee_pci_write hooks the VGA
+          and extended ports when PCI_COMMAND_IO is set in the command register*/
+
+        banshee->svga.bpp = 8;
+        banshee->svga.miscout = 1;
+
+        banshee->dramInit0 = 1 << 27;
+        if (has_sgram && mem_size == 16)
+                banshee->dramInit0 |= (1 << 26); /*2xSGRAM = 16 MB*/
+        if (!has_sgram)
+                banshee->dramInit1 = 1 << 30; /*SDRAM*/
+        banshee->svga.decode_mask = 0x1ffffff;
+
+        pci_add(banshee_pci_read, banshee_pci_write, banshee);
+
+        /*The 3D core shares the SVGA video memory for framebuffer and both
+          texture units*/
+        banshee->voodoo = voodoo_2d3d_card_init(voodoo_type);
+        banshee->voodoo->p = banshee;
+        banshee->voodoo->vram = banshee->svga.vram;
+        banshee->voodoo->changedvram = banshee->svga.changedvram;
+        banshee->voodoo->fb_mem = banshee->svga.vram;
+        banshee->voodoo->fb_mask = banshee->svga.vram_mask;
+        banshee->voodoo->tex_mem[0] = banshee->svga.vram;
+        banshee->voodoo->tex_mem_w[0] = (uint16_t *)banshee->svga.vram;
+        banshee->voodoo->tex_mem[1] = banshee->svga.vram;
+        banshee->voodoo->tex_mem_w[1] = (uint16_t *)banshee->svga.vram;
+        banshee->voodoo->texture_mask = banshee->svga.vram_mask;
+        voodoo_generate_filter_v1(banshee->voodoo);
+
+        banshee->vidSerialParallelPort = VIDSERIAL_DDC_DCK_W | VIDSERIAL_DDC_DDA_W;
+
+        ddc_init();
+
+        /*PCI subsystem ID bytes 0x2c-0x2f, per card variant*/
+        switch (type)
+        {
+                case TYPE_BANSHEE:
+                if (has_sgram)
+                {
+                        banshee->pci_regs[0x2c] = 0x1a;
+                        banshee->pci_regs[0x2d] = 0x12;
+                        banshee->pci_regs[0x2e] = 0x04;
+                        banshee->pci_regs[0x2f] = 0x00;
+                }
+                else
+                {
+                        banshee->pci_regs[0x2c] = 0x02;
+                        banshee->pci_regs[0x2d] = 0x11;
+                        banshee->pci_regs[0x2e] = 0x17;
+                        banshee->pci_regs[0x2f] = 0x10;
+                }
+                break;
+
+                case TYPE_V3_2000:
+                banshee->pci_regs[0x2c] = 0x1a;
+                banshee->pci_regs[0x2d] = 0x12;
+                banshee->pci_regs[0x2e] = 0x30;
+                banshee->pci_regs[0x2f] = 0x00;
+                break;
+
+                case TYPE_V3_3000:
+                banshee->pci_regs[0x2c] = 0x1a;
+                banshee->pci_regs[0x2d] = 0x12;
+                banshee->pci_regs[0x2e] = 0x3a;
+                banshee->pci_regs[0x2f] = 0x00;
+                break;
+
+                default: /*unknown type: subsystem ID bytes stay zero*/
+                break;
+        }
+
+        return banshee;
+}
+
+static void *banshee_init() /*Init hook of voodoo_banshee_device: SGRAM Banshee (has_sgram = 1) with ROM "pci_sg.rom"*/
+{
+        return banshee_init_common("pci_sg.rom", 1, TYPE_BANSHEE, VOODOO_BANSHEE);
+}
+static void *creative_banshee_init() /*Init hook of creative_voodoo_banshee_device: SDRAM Banshee (has_sgram = 0) with ROM "blasterpci.rom"*/
+{
+        return banshee_init_common("blasterpci.rom", 0, TYPE_BANSHEE, VOODOO_BANSHEE);
+}
+static void *v3_2000_init() /*Init hook of voodoo_3_2000_device: TYPE_V3_2000 / VOODOO_3, SDRAM, ROM "voodoo3_2000/2k11sd.rom"*/
+{
+        return banshee_init_common("voodoo3_2000/2k11sd.rom", 0, TYPE_V3_2000, VOODOO_3);
+}
+static void *v3_3000_init() /*Init hook of voodoo_3_3000_device: TYPE_V3_3000 / VOODOO_3, SDRAM, ROM "voodoo3_3000/3k12sd.rom"*/
+{
+        return banshee_init_common("voodoo3_3000/3k12sd.rom", 0, TYPE_V3_3000, VOODOO_3);
+}
+
+static int banshee_available() /*Availability hook: returns rom_present() for "pci_sg.rom", the ROM banshee_init loads*/
+{
+        return rom_present("pci_sg.rom");
+}
+static int creative_banshee_available() /*Availability hook: returns rom_present() for "blasterpci.rom", the ROM creative_banshee_init loads*/
+{
+        return rom_present("blasterpci.rom");
+}
+static int v3_2000_available() /*Availability hook: returns rom_present() for "voodoo3_2000/2k11sd.rom", the ROM v3_2000_init loads*/
+{
+        return rom_present("voodoo3_2000/2k11sd.rom");
+}
+static int v3_3000_available() /*Availability hook: returns rom_present() for "voodoo3_3000/3k12sd.rom", the ROM v3_3000_init loads*/
+{
+        return rom_present("voodoo3_3000/3k12sd.rom");
+}
+
+static void banshee_close(void *p) /*Close hook shared by all four device descriptors.
+          p - the banshee_t returned by banshee_init_common; it is freed here
+          and must not be used afterwards.*/
+{
+        banshee_t *banshee = (banshee_t *)p;
+
+        voodoo_card_close(banshee->voodoo); /*3D core first, then the SVGA core it was attached to*/
+        svga_close(&banshee->svga);
+        
+        free(banshee);
+}
+
+static void banshee_speed_changed(void *p) /*speed_changed hook: reruns svga_recalctimings on this card's SVGA state. p - banshee_t instance*/
+{
+        banshee_t *banshee = (banshee_t *)p;
+        
+        svga_recalctimings(&banshee->svga);
+}
+
+static void banshee_force_redraw(void *p) /*force_redraw hook: sets svga.fullchange to the global changeframecount. p - banshee_t instance*/
+{
+        banshee_t *banshee = (banshee_t *)p;
+
+        banshee->svga.fullchange = changeframecount;
+}
+
+static uint64_t status_time = 0;
+
+/*Append text to the NUL-terminated string in s while keeping the whole
+  string, terminator included, within max_len bytes. strncat's count is the
+  number of characters to append, so the room left must be computed first.*/
+static void banshee_status_append(char *s, int max_len, const char *text)
+{
+        size_t used = strlen(s);
+
+        if (max_len > 0 && used + 1 < (size_t)max_len)
+                strncat(s, text, (size_t)max_len - used - 1);
+}
+
+/*Status-window hook: appends SVGA status, 3D throughput / timing figures
+  since the previous call and the current overlay mode to s, then resets the
+  counters for the next interval.
+  s       - NUL-terminated output string, appended to
+  max_len - treated as the total size of s in bytes (assumption - confirm
+            against the caller)
+  p       - banshee_t instance*/
+static void banshee_add_status_info(char *s, int max_len, void *p)
+{
+        banshee_t *banshee = (banshee_t *)p;
+        voodoo_t *voodoo = banshee->voodoo;
+        char temps[512];
+        size_t temps_len;
+        int pixel_count_current[4];
+        int pixel_count_total;
+        int texel_count_current[4];
+        int texel_count_total;
+        int render_time[4];
+        uint64_t new_time = timer_read();
+        uint64_t status_diff = new_time - status_time;
+        int c;
+        status_time = new_time;
+
+        svga_add_status_info(s, max_len, &banshee->svga);
+
+        /*Snapshot the counters once so totals and the saved "old" values agree*/
+        for (c = 0; c < 4; c++)
+        {
+                pixel_count_current[c] = voodoo->pixel_count[c];
+                texel_count_current[c] = voodoo->texel_count[c];
+                render_time[c] = voodoo->render_time[c];
+        }
+
+        pixel_count_total = (pixel_count_current[0] + pixel_count_current[1] + pixel_count_current[2] + pixel_count_current[3]) -
+                (voodoo->pixel_count_old[0] + voodoo->pixel_count_old[1] + voodoo->pixel_count_old[2] + voodoo->pixel_count_old[3]);
+        texel_count_total = (texel_count_current[0] + texel_count_current[1] + texel_count_current[2] + texel_count_current[3]) -
+                (voodoo->texel_count_old[0] + voodoo->texel_count_old[1] + voodoo->texel_count_old[2] + voodoo->texel_count_old[3]);
+
+        /*snprintf always terminates within sizeof(temps), so each later piece
+          can be written at the current end with the remaining space*/
+        snprintf(temps, sizeof(temps), "%f Mpixels/sec (%f)\n%f Mtexels/sec (%f)\n%f ktris/sec\n%f%% CPU (%f%% real)\n%d frames/sec (%i)\n%f%% CPU (%f%% real)\n",
+                (double)pixel_count_total/1000000.0,
+                ((double)pixel_count_total/1000000.0) / ((double)render_time[0] / status_diff),
+                (double)texel_count_total/1000000.0,
+                ((double)texel_count_total/1000000.0) / ((double)render_time[0] / status_diff),
+                (double)voodoo->tri_count/1000.0, ((double)voodoo->time * 100.0) / timer_freq, ((double)voodoo->time * 100.0) / status_diff, voodoo->frame_count, voodoo_recomp,
+                ((double)voodoo->render_time[0] * 100.0) / timer_freq, ((double)voodoo->render_time[0] * 100.0) / status_diff);
+        if (voodoo->render_threads >= 2)
+        {
+                temps_len = strlen(temps);
+                snprintf(temps + temps_len, sizeof(temps) - temps_len, "%f%% CPU (%f%% real)\n",
+                        ((double)voodoo->render_time[1] * 100.0) / timer_freq, ((double)voodoo->render_time[1] * 100.0) / status_diff);
+        }
+        if (voodoo->render_threads == 4)
+        {
+                temps_len = strlen(temps);
+                snprintf(temps + temps_len, sizeof(temps) - temps_len, "%f%% CPU (%f%% real)\n%f%% CPU (%f%% real)\n",
+                        ((double)voodoo->render_time[2] * 100.0) / timer_freq, ((double)voodoo->render_time[2] * 100.0) / status_diff,
+                        ((double)voodoo->render_time[3] * 100.0) / timer_freq, ((double)voodoo->render_time[3] * 100.0) / status_diff);
+        }
+
+        banshee_status_append(s, max_len, temps);
+
+        banshee_status_append(s, max_len, "Overlay mode: "); /* leilei debug additions */
+        if ((banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK) == VIDPROCCFG_FILTER_MODE_DITHER_2X2)
+                banshee_status_append(s, max_len, "2x2 box filter\n");
+        if ((banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK) == VIDPROCCFG_FILTER_MODE_DITHER_4X4)
+                banshee_status_append(s, max_len, "4x1 tap filter\n");
+        if ((banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK) == VIDPROCCFG_FILTER_MODE_POINT)
+                banshee_status_append(s, max_len, "Nearest neighbor\n");
+        if ((banshee->vidProcCfg & VIDPROCCFG_FILTER_MODE_MASK) == VIDPROCCFG_FILTER_MODE_BILINEAR)
+                banshee_status_append(s, max_len, "Bilinear filtered\n");
+        if ((banshee->vidProcCfg & VIDPROCCFG_H_SCALE_ENABLE))
+                banshee_status_append(s, max_len, "H scaled \n");
+        if ((banshee->vidProcCfg & VIDPROCCFG_V_SCALE_ENABLE))
+                banshee_status_append(s, max_len, "V scaled \n");
+        if ((banshee->vidProcCfg & VIDPROCCFG_2X_MODE))
+                banshee_status_append(s, max_len, "2X mode\n");
+
+        banshee_status_append(s, max_len, "\n");
+
+        /*Start a fresh measurement interval*/
+        for (c = 0; c < 4; c++)
+        {
+                voodoo->pixel_count_old[c] = pixel_count_current[c];
+                voodoo->texel_count_old[c] = texel_count_current[c];
+                voodoo->render_time[c] = 0;
+        }
+
+        voodoo->tri_count = voodoo->frame_count = 0;
+        voodoo->rd_count = voodoo->wr_count = voodoo->tex_count = 0;
+        voodoo->time = 0;
+
+        voodoo->read_time = pci_nonburst_time + pci_burst_time;
+
+        voodoo_recomp = 0;
+}
+
+device_t voodoo_banshee_device = /*Descriptor for the reference (SGRAM) Banshee. Initializers are positional;
+          presumably name, flags, init, close, available, speed_changed,
+          force_redraw, add_status_info, config - confirm against device.h*/
+{
+        "Voodoo Banshee PCI (reference)",
+        DEVICE_PCI,
+        banshee_init,
+        banshee_close,
+        banshee_available,
+        banshee_speed_changed,
+        banshee_force_redraw,
+        banshee_add_status_info,
+        banshee_sgram_config /*the only descriptor that offers the "memory" option*/
+};
+
+device_t creative_voodoo_banshee_device = /*Descriptor for the Creative Labs SDRAM Banshee. Initializers are positional;
+          presumably name, flags, init, close, available, speed_changed,
+          force_redraw, add_status_info, config - confirm against device.h*/
+{
+        "Creative Labs 3D Blaster Banshee PCI",
+        DEVICE_PCI,
+        creative_banshee_init,
+        banshee_close,
+        creative_banshee_available,
+        banshee_speed_changed,
+        banshee_force_redraw,
+        banshee_add_status_info,
+        banshee_sdram_config
+};
+
+device_t voodoo_3_2000_device = /*Descriptor for the Voodoo 3 2000. Initializers are positional;
+          presumably name, flags, init, close, available, speed_changed,
+          force_redraw, add_status_info, config - confirm against device.h*/
+{
+        "Voodoo 3 2000 PCI",
+        DEVICE_PCI,
+        v3_2000_init,
+        banshee_close,
+        v3_2000_available,
+        banshee_speed_changed,
+        banshee_force_redraw,
+        banshee_add_status_info,
+        banshee_sdram_config
+};
+
+device_t voodoo_3_3000_device = /*Descriptor for the Voodoo 3 3000. Initializers are positional;
+          presumably name, flags, init, close, available, speed_changed,
+          force_redraw, add_status_info, config - confirm against device.h*/
+{
+        "Voodoo 3 3000 PCI",
+        DEVICE_PCI,
+        v3_3000_init,
+        banshee_close,
+        v3_3000_available,
+        banshee_speed_changed,
+        banshee_force_redraw,
+        banshee_add_status_info,
+        banshee_sdram_config
+};
diff --git a/pcem/vid_voodoo_banshee.h b/pcem/vid_voodoo_banshee.h
new file mode 100644 (file)
index 0000000..ddd7e3f
--- /dev/null
@@ -0,0 +1,6 @@
+extern device_t voodoo_banshee_device;
+extern device_t creative_voodoo_banshee_device;
+extern device_t voodoo_3_2000_device;
+extern device_t voodoo_3_3000_device;
+
+void banshee_set_overlay_addr(void *p, uint32_t addr);
diff --git a/pcem/vid_voodoo_banshee_blitter.cpp b/pcem/vid_voodoo_banshee_blitter.cpp
new file mode 100644 (file)
index 0000000..f6f0d03
--- /dev/null
@@ -0,0 +1,1450 @@
+/*Current issues :
+  - missing screen->screen scaled blits with format conversion
+  - missing YUV blits
+  - missing linestyle
+  - missing wait for vsync
+  - missing reversible lines
+
+  Notes :
+  - 16 bpp runs with tiled framebuffer - to aid 3D?
+    8 and 32 bpp use linear
+*/
+#include <math.h>
+#include <stddef.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_banshee_blitter.h"
+#include "vid_voodoo_render.h"
+
+#define COMMAND_CMD_MASK                         (0xf) /* COMMAND_* are fields of banshee_blt.command; the COMMAND_CMD_* operation codes live under this mask */
+#define COMMAND_CMD_NOP                          (0 << 0)
+#define COMMAND_CMD_SCREEN_TO_SCREEN_BLT         (1 << 0)
+#define COMMAND_CMD_SCREEN_TO_SCREEN_STRETCH_BLT (2 << 0)
+#define COMMAND_CMD_HOST_TO_SCREEN_BLT           (3 << 0)
+#define COMMAND_CMD_HOST_TO_SCREEN_STRETCH_BLT   (4 << 0)
+#define COMMAND_CMD_RECTFILL                     (5 << 0)
+#define COMMAND_CMD_LINE                         (6 << 0)
+#define COMMAND_CMD_POLYLINE                     (7 << 0)
+#define COMMAND_CMD_POLYFILL                     (8 << 0)
+#define COMMAND_INITIATE        (1 << 8)
+#define COMMAND_INC_X_START     (1 << 10) /* end_command() copies dstX into the low half of dstXY */
+#define COMMAND_INC_Y_START     (1 << 11) /* end_command() copies dstY into the high half of dstXY */
+#define COMMAND_STIPPLE_LINE    (1 << 12)
+#define COMMAND_PATTERN_MONO    (1 << 13) /* pattern is 1 bpp, expanded to colorFore/colorBack */
+#define COMMAND_DX              (1 << 14) /* set: X steps by -1, clear: +1 */
+#define COMMAND_DY              (1 << 15) /* set: Y steps by -1, clear: +1 */
+#define COMMAND_TRANS_MONO      (1 << 16) /* clear bits of a mono pattern/source leave the destination untouched */
+#define COMMAND_PATOFF_X_MASK   (7 << 17)
+#define COMMAND_PATOFF_X_SHIFT  (17)
+#define COMMAND_PATOFF_Y_MASK   (7 << 20)
+#define COMMAND_PATOFF_Y_SHIFT  (20)
+#define COMMAND_CLIP_SEL        (1 << 23) /* set: clip[1] is used, clear: clip[0] */
+/* Flags tested against banshee_blt.commandExtra */
+#define CMDEXTRA_SRC_COLORKEY   (1 << 0)
+#define CMDEXTRA_DST_COLORKEY   (1 << 1)
+#define CMDEXTRA_FORCE_PAT_ROW0 (1 << 3)
+/* Fields of banshee_blt.srcFormat: stride, colour format, swizzle flags, host-data packing */
+#define SRC_FORMAT_STRIDE_MASK (0x1fff)
+#define SRC_FORMAT_COL_MASK    (0xf << 16)
+#define SRC_FORMAT_COL_1_BPP   (0 << 16)
+#define SRC_FORMAT_COL_8_BPP   (1 << 16)
+#define SRC_FORMAT_COL_16_BPP  (3 << 16)
+#define SRC_FORMAT_COL_24_BPP  (4 << 16)
+#define SRC_FORMAT_COL_32_BPP  (5 << 16)
+#define SRC_FORMAT_COL_YUYV    (8 << 16)
+#define SRC_FORMAT_COL_UYVY    (9 << 16)
+#define SRC_FORMAT_BYTE_SWIZZLE   (1 << 20)
+#define SRC_FORMAT_WORD_SWIZZLE   (1 << 21)
+#define SRC_FORMAT_PACKING_MASK   (3 << 22)
+#define SRC_FORMAT_PACKING_STRIDE (0 << 22)
+#define SRC_FORMAT_PACKING_BYTE   (1 << 22)
+#define SRC_FORMAT_PACKING_WORD   (2 << 22)
+#define SRC_FORMAT_PACKING_DWORD  (3 << 22)
+/* Fields of banshee_blt.dstFormat */
+#define DST_FORMAT_STRIDE_MASK (0x1fff)
+#define DST_FORMAT_COL_MASK    (0xf << 16)
+#define DST_FORMAT_COL_8_BPP   (1 << 16)
+#define DST_FORMAT_COL_16_BPP  (3 << 16)
+#define DST_FORMAT_COL_24_BPP  (4 << 16)
+#define DST_FORMAT_COL_32_BPP  (5 << 16)
+/* BRES_*: users are outside this view; by name, Bresenham error-term fields for line drawing - confirm at the point of use */
+#define BRES_ERROR_MASK        (0xffff)
+#define BRES_ERROR_USE         (1 << 31)
+
+enum
+{       /* Pixel layouts accepted by colorkey() and MIX() */
+        COLORKEY_8,     /* one 8-bit value, compared as a whole */
+        COLORKEY_16,    /* 5:6:5 RGB, each channel range-checked separately */
+        COLORKEY_32     /* 8:8:8 RGB in bits 23..0, each channel range-checked separately */
+};
+
+static int colorkey(voodoo_t *voodoo, uint32_t src, int src_notdst, int color_format)
+{
+        uint32_t min = src_notdst ? voodoo->banshee_blt.srcColorkeyMin : voodoo->banshee_blt.dstColorkeyMin;
+        uint32_t max = src_notdst ? voodoo->banshee_blt.srcColorkeyMax : voodoo->banshee_blt.dstColorkeyMax;
+        
+        if (!(voodoo->banshee_blt.commandExtra & (src_notdst ? CMDEXTRA_SRC_COLORKEY : CMDEXTRA_DST_COLORKEY)))
+                return 0;
+                
+        switch (color_format)
+        {
+                case COLORKEY_8:
+                return ((src & 0xff) >= (min & 0xff)) && ((src & 0xff) <= (max & 0xff));
+                
+                case COLORKEY_16:
+                {
+                        int r = (src >> 11) & 0x1f, r_min = (min >> 11) & 0x1f, r_max = (max >> 11) & 0x1f;
+                        int g = (src >>  5) & 0x3f, g_min = (min >>  5) & 0x3f, g_max = (max >>  5) & 0x3f;
+                        int b = src & 0x1f, b_min = min & 0x1f, b_max = max & 0x1f;
+                        
+                        return (r >= r_min) && (r <= r_max) && (g >= g_min) && (g <= g_max) &&
+                                (b >= b_min) && (b <= b_max);
+                }
+
+                case COLORKEY_32:
+                {
+                        int r = (src >> 16) & 0xff, r_min = (min >> 16) & 0xff, r_max = (max >> 16) & 0xff;
+                        int g = (src >>  8) & 0xff, g_min = (min >>  8) & 0xff, g_max = (max >>  8) & 0xff;
+                        int b = src & 0xff, b_min = min & 0xff, b_max = max & 0xff;
+                
+                        return (r >= r_min) && (r <= r_max) && (g >= g_min) && (g <= g_max) &&
+                                (b >= b_min) && (b <= b_max);
+                }
+                
+                default:
+                return 0;
+        }
+}
+
+static uint32_t MIX(voodoo_t *voodoo, uint32_t dest, uint32_t src, uint32_t pattern, int colour_format_src, int colour_format_dest)
+{
+        int rop_nr = 0;
+        uint32_t result = 0;
+        uint32_t rop;
+        
+        if (colorkey(voodoo, src, 1, colour_format_src))
+                rop_nr |= 2;
+        if (colorkey(voodoo, dest, 0, colour_format_dest))
+                rop_nr |= 1;
+                
+        rop = voodoo->banshee_blt.rops[rop_nr];
+
+        if (rop & 0x01)
+                result |= (~pattern & ~src & ~dest);
+        if (rop & 0x02)
+                result |= (~pattern & ~src &  dest);
+        if (rop & 0x04)
+                result |= (~pattern &  src & ~dest);
+        if (rop & 0x08)
+                result |= (~pattern &  src &  dest);
+        if (rop & 0x10)
+                result |= ( pattern & ~src & ~dest);
+        if (rop & 0x20)
+                result |= ( pattern & ~src &  dest);
+        if (rop & 0x40)
+                result |= ( pattern &  src & ~dest);
+        if (rop & 0x80)
+                result |= ( pattern &  src &  dest);
+
+        return result;
+}
+
+static uint32_t get_addr(voodoo_t *voodoo, int x, int y, int src_notdst, uint32_t src_stride)
+{
+        uint32_t stride = src_notdst ? src_stride : voodoo->banshee_blt.dst_stride;
+        uint32_t base_addr = src_notdst ? voodoo->banshee_blt.srcBaseAddr : voodoo->banshee_blt.dstBaseAddr;
+        
+        if (src_notdst ? voodoo->banshee_blt.srcBaseAddr_tiled : voodoo->banshee_blt.dstBaseAddr_tiled)
+                return (base_addr + (x & 127) + ((x >> 7) * 128*32) + ((y & 31) * 128) + (y >> 5)*stride) & voodoo->fb_mask;
+        else
+                return (base_addr + x + y*stride) & voodoo->fb_mask;
+}
+
+static void PLOT(voodoo_t *voodoo, int x, int y, int pat_x, int pat_y, uint8_t pattern_mask, uint8_t rop, uint32_t src, int src_colorkey)
+{
+        switch (voodoo->banshee_blt.dstFormat & DST_FORMAT_COL_MASK)
+        {
+                case DST_FORMAT_COL_8_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x + y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                        uint32_t dest = voodoo->vram[addr];
+                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                        voodoo->banshee_blt.colorPattern8[(pat_x & 7) + (pat_y & 7)*8];
+
+                        voodoo->vram[addr] = MIX(voodoo, dest, src, pattern, src_colorkey, COLORKEY_8);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+                case DST_FORMAT_COL_16_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x*2, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x*2 + y*voodoo->banshee_blt.dst_stride)  & voodoo->fb_mask;
+                        uint32_t dest = *(uint16_t *)&voodoo->vram[addr];
+                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                        voodoo->banshee_blt.colorPattern16[(pat_x & 7) + (pat_y & 7)*8];
+
+                        *(uint16_t *)&voodoo->vram[addr] = MIX(voodoo, dest, src, pattern, src_colorkey, COLORKEY_16);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+                case DST_FORMAT_COL_24_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x*3, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x*3 + y*voodoo->banshee_blt.dst_stride)  & voodoo->fb_mask;
+                        uint32_t dest = *(uint32_t *)&voodoo->vram[addr];
+                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                        voodoo->banshee_blt.colorPattern24[(pat_x & 7) + (pat_y & 7)*8];
+
+                        *(uint32_t *)&voodoo->vram[addr] = (MIX(voodoo, dest, src, pattern, src_colorkey, COLORKEY_32) & 0xffffff) | (dest & 0xff000000);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+                case DST_FORMAT_COL_32_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x*4, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x*4 + y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                        uint32_t dest = *(uint32_t *)&voodoo->vram[addr];
+                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                        voodoo->banshee_blt.colorPattern[(pat_x & 7) + (pat_y & 7)*8];
+
+                        *(uint32_t *)&voodoo->vram[addr] = MIX(voodoo, dest, src, pattern, src_colorkey, COLORKEY_32);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+        }
+}
+
+static void PLOT_LINE(voodoo_t *voodoo, int x, int y, uint8_t rop, uint32_t pattern, int src_colorkey)
+{
+        switch (voodoo->banshee_blt.dstFormat & DST_FORMAT_COL_MASK)
+        {
+                case DST_FORMAT_COL_8_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x + y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                        uint32_t dest = voodoo->vram[addr];
+
+                        voodoo->vram[addr] = MIX(voodoo, dest, voodoo->banshee_blt.colorFore, pattern, src_colorkey, COLORKEY_8);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+                case DST_FORMAT_COL_16_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x*2, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x*2 + y*voodoo->banshee_blt.dst_stride)  & voodoo->fb_mask;
+                        uint32_t dest = *(uint16_t *)&voodoo->vram[addr];
+
+                        *(uint16_t *)&voodoo->vram[addr] = MIX(voodoo, dest, voodoo->banshee_blt.colorFore, pattern, src_colorkey, COLORKEY_16);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+                case DST_FORMAT_COL_24_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x*3, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x*3 + y*voodoo->banshee_blt.dst_stride)  & voodoo->fb_mask;
+                        uint32_t dest = *(uint32_t *)&voodoo->vram[addr];
+
+                        *(uint32_t *)&voodoo->vram[addr] = (MIX(voodoo, dest, voodoo->banshee_blt.colorFore, pattern, src_colorkey, COLORKEY_32) & 0xffffff) | (dest & 0xff000000);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+                case DST_FORMAT_COL_32_BPP:
+                {
+                        uint32_t addr = get_addr(voodoo, x*4, y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + x*4 + y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                        uint32_t dest = *(uint32_t *)&voodoo->vram[addr];
+
+                        *(uint32_t *)&voodoo->vram[addr] = MIX(voodoo, dest, voodoo->banshee_blt.colorFore, pattern, src_colorkey, COLORKEY_32);
+                        voodoo->changedvram[addr >> 12] = changeframecount;
+                        break;
+                }
+        }
+}
+
+static void update_src_stride(voodoo_t *voodoo)
+{
+        int bpp;
+
+        switch (voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK)
+        {
+                case SRC_FORMAT_COL_1_BPP:
+                bpp = 1;
+                break;
+                case SRC_FORMAT_COL_8_BPP:
+                bpp = 8;
+                break;
+                case SRC_FORMAT_COL_16_BPP:
+                bpp = 16;
+                break;
+                case SRC_FORMAT_COL_24_BPP:
+                bpp = 24;
+                break;
+                case SRC_FORMAT_COL_32_BPP:
+                bpp = 32;
+                break;
+
+                default:
+                bpp = 16;
+                break;
+        }
+
+        switch (voodoo->banshee_blt.srcFormat & SRC_FORMAT_PACKING_MASK)
+        {
+                case SRC_FORMAT_PACKING_STRIDE:
+                voodoo->banshee_blt.src_stride_src = voodoo->banshee_blt.src_stride; //voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK;
+                voodoo->banshee_blt.src_stride_dest = voodoo->banshee_blt.src_stride; //voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK;
+                voodoo->banshee_blt.host_data_size_src = (voodoo->banshee_blt.srcSizeX * bpp + 7) >> 3;
+                voodoo->banshee_blt.host_data_size_dest = (voodoo->banshee_blt.dstSizeX * bpp + 7) >> 3;
+//                pclog("Stride packing %08x %08x   bpp=%i dstSizeX=%i\n", voodoo->banshee_blt.src_stride_dest, voodoo->banshee_blt.host_data_size_dest, bpp, voodoo->banshee_blt.dstSizeX);
+                break;
+
+                case SRC_FORMAT_PACKING_BYTE:
+                voodoo->banshee_blt.src_stride_src = (voodoo->banshee_blt.srcSizeX * bpp + 7) >> 3;
+                voodoo->banshee_blt.src_stride_dest = (voodoo->banshee_blt.dstSizeX * bpp + 7) >> 3;
+                voodoo->banshee_blt.host_data_size_src = voodoo->banshee_blt.src_stride_src;
+                voodoo->banshee_blt.host_data_size_dest = voodoo->banshee_blt.src_stride_dest;
+//                pclog("Byte packing %08x %08x\n", voodoo->banshee_blt.src_stride_dest, voodoo->banshee_blt.host_data_size_dest);
+                break;
+
+                case SRC_FORMAT_PACKING_WORD:
+                voodoo->banshee_blt.src_stride_src = ((voodoo->banshee_blt.srcSizeX * bpp + 15) >> 4) * 2;
+                voodoo->banshee_blt.src_stride_dest = ((voodoo->banshee_blt.dstSizeX * bpp + 15) >> 4) * 2;
+                voodoo->banshee_blt.host_data_size_src = voodoo->banshee_blt.src_stride_src;
+                voodoo->banshee_blt.host_data_size_dest = voodoo->banshee_blt.src_stride_dest;
+//                pclog("Word packing %08x %08x\n", voodoo->banshee_blt.src_stride_dest, voodoo->banshee_blt.host_data_size_dest);
+                break;
+
+                case SRC_FORMAT_PACKING_DWORD:
+                voodoo->banshee_blt.src_stride_src = ((voodoo->banshee_blt.srcSizeX * bpp + 31) >> 5) * 4;
+                voodoo->banshee_blt.src_stride_dest = ((voodoo->banshee_blt.dstSizeX * bpp + 31) >> 5) * 4;
+                voodoo->banshee_blt.host_data_size_src = voodoo->banshee_blt.src_stride_src;
+                voodoo->banshee_blt.host_data_size_dest = voodoo->banshee_blt.src_stride_dest;
+//                pclog("Dword packing %08x %08x\n", voodoo->banshee_blt.src_stride_dest, voodoo->banshee_blt.host_data_size_dest);
+                break;
+        }
+}
+
+static void end_command(voodoo_t *voodoo)
+{
+        /*Update dest coordinates if required*/
+        if (voodoo->banshee_blt.command & COMMAND_INC_X_START)
+        {
+                voodoo->banshee_blt.dstXY &= ~0x0000ffff;
+                voodoo->banshee_blt.dstXY |= (voodoo->banshee_blt.dstX & 0xffff);
+        }
+
+        if (voodoo->banshee_blt.command & COMMAND_INC_Y_START)
+        {
+                voodoo->banshee_blt.dstXY &= ~0xffff0000;
+                voodoo->banshee_blt.dstXY |= (voodoo->banshee_blt.dstY << 16);
+        }
+}
+
+static void banshee_do_rectfill(voodoo_t *voodoo)
+{
+        clip_t *clip = &voodoo->banshee_blt.clip[(voodoo->banshee_blt.command & COMMAND_CLIP_SEL) ? 1 : 0];
+        int dst_y = voodoo->banshee_blt.dstY;
+        uint8_t *pattern_mono = (uint8_t *)voodoo->banshee_blt.colorPattern;
+        int pat_y = (voodoo->banshee_blt.commandExtra & CMDEXTRA_FORCE_PAT_ROW0) ? 0 : (voodoo->banshee_blt.patoff_y + voodoo->banshee_blt.dstY);
+        int use_pattern_trans = (voodoo->banshee_blt.command & (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO)) ==
+                                             (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO);
+        uint8_t rop = voodoo->banshee_blt.command >> 24;
+
+//        pclog("banshee_do_rectfill: size=%i,%i  dst=%i,%i\n", voodoo->banshee_blt.dstSizeX, voodoo->banshee_blt.dstSizeY, voodoo->banshee_blt.dstX, voodoo->banshee_blt.dstY);
+//        pclog("clipping: %i,%i -> %i,%i\n", clip->x_min, clip->y_min, clip->x_max, clip->y_max);
+//        pclog("colorFore=%08x\n", voodoo->banshee_blt.colorFore);
+        for (voodoo->banshee_blt.cur_y = 0; voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY; voodoo->banshee_blt.cur_y++)
+        {
+                int dst_x = voodoo->banshee_blt.dstX;
+
+                if (dst_y >= clip->y_min && dst_y < clip->y_max)
+                {
+                        int pat_x = voodoo->banshee_blt.patoff_x + voodoo->banshee_blt.dstX;
+                        uint8_t pattern_mask = pattern_mono[pat_y & 7];
+                        
+                        for (voodoo->banshee_blt.cur_x = 0; voodoo->banshee_blt.cur_x < voodoo->banshee_blt.dstSizeX; voodoo->banshee_blt.cur_x++)
+                        {
+                                int pattern_trans = use_pattern_trans ? (pattern_mask & (1 << (7-(pat_x & 7)))) : 1;
+
+                                if (dst_x >= clip->x_min && dst_x < clip->x_max && pattern_trans)
+                                        PLOT(voodoo, dst_x, dst_y, pat_x, pat_y, pattern_mask, rop, voodoo->banshee_blt.colorFore, COLORKEY_32);
+
+                                dst_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                pat_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                        }
+                }
+                dst_y += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+                if (!(voodoo->banshee_blt.commandExtra & CMDEXTRA_FORCE_PAT_ROW0))
+                        pat_y += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+        }
+
+        end_command(voodoo);
+}
+
+static void do_screen_to_screen_line(voodoo_t *voodoo, uint8_t *src_p, int use_x_dir, int src_x, int src_tiled)
+{
+        clip_t *clip = &voodoo->banshee_blt.clip[(voodoo->banshee_blt.command & COMMAND_CLIP_SEL) ? 1 : 0];
+        int dst_y = voodoo->banshee_blt.dstY;
+        int pat_y = (voodoo->banshee_blt.commandExtra & CMDEXTRA_FORCE_PAT_ROW0) ? 0 : (voodoo->banshee_blt.patoff_y + voodoo->banshee_blt.dstY);
+        uint8_t *pattern_mono = (uint8_t *)voodoo->banshee_blt.colorPattern;
+        int use_pattern_trans = (voodoo->banshee_blt.command & (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO)) ==
+                                             (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO);
+        uint8_t rop = voodoo->banshee_blt.command >> 24;
+        int src_colorkey;
+        
+        switch (voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK)
+        {
+                case SRC_FORMAT_COL_8_BPP:
+                src_colorkey = COLORKEY_8;
+                break;
+                case SRC_FORMAT_COL_16_BPP:
+                src_colorkey = COLORKEY_16;
+                break;
+                default:
+                src_colorkey = COLORKEY_32;
+                break;
+        }
+//        pclog("do_screen_to_screen_line: srcFormat=%08x dst=%08x\n", voodoo->banshee_blt.srcFormat, voodoo->banshee_blt.dstFormat);
+        if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) ==
+                (voodoo->banshee_blt.dstFormat & DST_FORMAT_COL_MASK))
+        {
+                /*No conversion required*/
+                if (dst_y >= clip->y_min && dst_y < clip->y_max)
+                {
+                        int dst_x = voodoo->banshee_blt.dstX;
+                        int pat_x = voodoo->banshee_blt.patoff_x + voodoo->banshee_blt.dstX;
+                        uint8_t pattern_mask = pattern_mono[pat_y & 7];
+
+                        for (voodoo->banshee_blt.cur_x = 0; voodoo->banshee_blt.cur_x < voodoo->banshee_blt.dstSizeX; voodoo->banshee_blt.cur_x++)
+                        {
+                                int pattern_trans = use_pattern_trans ? (pattern_mask & (1 << (7-(pat_x & 7)))) : 1;
+                                int src_x_real = (src_x * voodoo->banshee_blt.src_bpp) >> 3;
+                                
+                                if (src_tiled)
+                                        src_x_real = (src_x_real & 127) + ((src_x_real >> 7) * 128*32);
+
+                                if (dst_x >= clip->x_min && dst_x < clip->x_max && pattern_trans)
+                                {
+                                        switch (voodoo->banshee_blt.dstFormat & DST_FORMAT_COL_MASK)
+                                        {
+                                                case DST_FORMAT_COL_8_BPP:
+                                                {
+                                                        uint32_t dst_addr = get_addr(voodoo, dst_x, dst_y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + dst_x + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                        uint32_t src = src_p[src_x_real];
+                                                        uint32_t dest = voodoo->vram[dst_addr];
+                                                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                        voodoo->banshee_blt.colorPattern8[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                        voodoo->vram[dst_addr] = MIX(voodoo, dest, src, pattern, COLORKEY_8, COLORKEY_8);
+                                                        voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                        break;
+                                                }
+                                                case DST_FORMAT_COL_16_BPP:
+                                                {
+                                                        uint32_t dst_addr = get_addr(voodoo, dst_x*2, dst_y, 0, 0);//dst_addr = (voodoo->banshee_blt.dstBaseAddr + dst_x*2 + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                        uint32_t src = *(uint16_t *)&src_p[src_x_real];
+                                                        uint32_t dest = *(uint16_t *)&voodoo->vram[dst_addr];
+                                                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                        voodoo->banshee_blt.colorPattern16[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                        *(uint16_t *)&voodoo->vram[dst_addr] = MIX(voodoo, dest, src, pattern, COLORKEY_16, COLORKEY_16);
+                                                        voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                        break;
+                                                }
+                                                case DST_FORMAT_COL_24_BPP:
+                                                {
+                                                        uint32_t dst_addr = get_addr(voodoo, dst_x*3, dst_y, 0, 0);//dst_addr = (voodoo->banshee_blt.dstBaseAddr + dst_x*3 + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                        uint32_t src = *(uint32_t *)&src_p[src_x_real];
+                                                        uint32_t dest = *(uint32_t *)&voodoo->vram[dst_addr];
+                                                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                        voodoo->banshee_blt.colorPattern24[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                        *(uint32_t *)&voodoo->vram[dst_addr] = (MIX(voodoo, dest, src, pattern, COLORKEY_32, COLORKEY_32) & 0xffffff) | (dest & 0xff000000);
+                                                        voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                        break;
+                                                }
+                                                case DST_FORMAT_COL_32_BPP:
+                                                {
+                                                        uint32_t dst_addr = get_addr(voodoo, dst_x*4, dst_y, 0, 0);//dst_addr = (voodoo->banshee_blt.dstBaseAddr + dst_x*4 + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                        uint32_t src = *(uint32_t *)&src_p[src_x_real];
+                                                        uint32_t dest = *(uint32_t *)&voodoo->vram[dst_addr];
+                                                        uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                        ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                        voodoo->banshee_blt.colorPattern[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                        *(uint32_t *)&voodoo->vram[dst_addr] = MIX(voodoo, dest, src, pattern, COLORKEY_32, COLORKEY_32);
+                                                        voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                        break;
+                                                }
+                                        }
+                                }
+                                if (use_x_dir)
+                                {
+                                        src_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                        dst_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                        pat_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                }
+                                else
+                                {
+                                        src_x++;
+                                        dst_x++;
+                                        pat_x++;
+                                }
+                        }
+                }
+                voodoo->banshee_blt.srcY += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+                voodoo->banshee_blt.dstY += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+        }
+        else
+        {
+                /*Conversion required*/
+                if (dst_y >= clip->y_min && dst_y < clip->y_max)
+                {
+//                        int src_x = voodoo->banshee_blt.srcX;
+                        int dst_x = voodoo->banshee_blt.dstX;
+                        int pat_x = voodoo->banshee_blt.patoff_x + voodoo->banshee_blt.dstX;
+                        uint8_t pattern_mask = pattern_mono[pat_y & 7];
+
+                        for (voodoo->banshee_blt.cur_x = 0; voodoo->banshee_blt.cur_x < voodoo->banshee_blt.dstSizeX; voodoo->banshee_blt.cur_x++)
+                        {
+                                int pattern_trans = use_pattern_trans ? (pattern_mask & (1 << (7-(pat_x & 7)))) : 1;
+                                int src_x_real = (src_x * voodoo->banshee_blt.src_bpp) >> 3;
+
+                                if (src_tiled)
+                                        src_x_real = (src_x_real & 127) + ((src_x_real >> 7) * 128*32);
+
+                                if (dst_x >= clip->x_min && dst_x < clip->x_max && pattern_trans)
+                                {
+                                        uint32_t src_data = 0;
+                                        int transparent = 0;
+
+                                        switch (voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK)
+                                        {
+                                                case SRC_FORMAT_COL_1_BPP:
+                                                {
+                                                        uint8_t src_byte = src_p[src_x_real];
+                                                        src_data = (src_byte & (0x80 >> (src_x & 7))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack;
+                                                        if (voodoo->banshee_blt.command & COMMAND_TRANS_MONO)
+                                                                transparent = !(src_byte & (0x80 >> (src_x & 7)));
+//                                                        pclog(" 1bpp src_byte=%02x src_x=%i src_data=%x transparent=%i\n", src_byte, src_x, src_data, transparent);
+                                                        break;
+                                                }
+                                                case SRC_FORMAT_COL_8_BPP:
+                                                {
+                                                        src_data = src_p[src_x_real];
+                                                        break;
+                                                }
+                                                case SRC_FORMAT_COL_16_BPP:
+                                                {
+                                                        uint16_t src_16 = *(uint16_t *)&src_p[src_x_real];
+                                                        int r = (src_16 >> 11);
+                                                        int g = (src_16 >> 5) & 0x3f;
+                                                        int b = src_16 & 0x1f;
+
+                                                        r = (r << 3) | (r >> 2);
+                                                        g = (g << 2) | (g >> 4);
+                                                        b = (b << 3) | (b >> 2);
+                                                        src_data = (r << 16) | (g << 8) | b;
+                                                        break;
+                                                }
+                                                case SRC_FORMAT_COL_24_BPP:
+                                                {
+                                                        src_data = *(uint32_t *)&src_p[src_x_real];
+                                                        break;
+                                                }
+                                                case SRC_FORMAT_COL_32_BPP:
+                                                {
+                                                        src_data = *(uint32_t *)&src_p[src_x_real];
+                                                        break;
+                                                }
+#ifndef RELEASE_BUILD
+                                                default:
+                                                fatal("banshee_do_screen_to_screen_blt: unknown srcFormat %08x\n", voodoo->banshee_blt.srcFormat);
+#endif
+                                        }
+
+                                        if ((voodoo->banshee_blt.dstFormat & DST_FORMAT_COL_MASK) == DST_FORMAT_COL_16_BPP &&
+                                            (voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) != SRC_FORMAT_COL_1_BPP)
+                                        {
+                                                int r = src_data >> 16;
+                                                int g = (src_data >> 8) & 0xff;
+                                                int b = src_data & 0xff;
+
+                                                src_data = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
+                                        }
+
+                                        if (!transparent)
+                                                PLOT(voodoo, dst_x, dst_y, pat_x, pat_y, pattern_mask, rop, src_data, src_colorkey);
+                                }
+                                if (use_x_dir)
+                                {
+                                        src_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                        dst_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                        pat_x += (voodoo->banshee_blt.command & COMMAND_DX) ? -1 : 1;
+                                }
+                                else
+                                {
+                                        src_x++;
+                                        dst_x++;
+                                        pat_x++;
+                                }
+                        }
+                }
+                voodoo->banshee_blt.srcY += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+                voodoo->banshee_blt.dstY += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+        }
+}
+
+static void banshee_do_screen_to_screen_blt(voodoo_t *voodoo)
+{
+//        pclog("screen_to_screen: %08x %08x %08x\n", voodoo->banshee_blt.srcFormat, voodoo->banshee_blt.src_stride, voodoo->banshee_blt.src_stride_dest);
+//                return;
+        for (voodoo->banshee_blt.cur_y = 0; voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY; voodoo->banshee_blt.cur_y++)
+        {
+                uint32_t src_addr = get_addr(voodoo, 0, voodoo->banshee_blt.srcY, 1, voodoo->banshee_blt.src_stride_dest);
+//                if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+//                        pclog(" srcY=%i src_addr=%08x\n", voodoo->banshee_blt.srcY, src_addr);
+                do_screen_to_screen_line(voodoo, &voodoo->vram[src_addr], 1, voodoo->banshee_blt.srcX, voodoo->banshee_blt.srcBaseAddr_tiled);
+        }
+        end_command(voodoo);
+}
+
+static void banshee_do_host_to_screen_blt(voodoo_t *voodoo, int count, uint32_t data)
+{
+//        if (voodoo->banshee_blt.dstBaseAddr == 0xee5194)
+//                pclog("banshee_do_host_to_screen_blt: data=%08x host_data_count=%i src_stride_dest=%i host_data_size_dest=%i\n", data, voodoo->banshee_blt.host_data_count, voodoo->banshee_blt.src_stride_dest, voodoo->banshee_blt.host_data_size_dest);
+
+        if (voodoo->banshee_blt.srcFormat & SRC_FORMAT_BYTE_SWIZZLE)
+                data = (data >> 24) | ((data >> 8) & 0xff00) | ((data << 8) & 0xff0000) | (data << 24);
+        if (voodoo->banshee_blt.srcFormat & SRC_FORMAT_WORD_SWIZZLE)
+                data = (data >> 16) | (data << 16);
+
+        if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_PACKING_MASK) == SRC_FORMAT_PACKING_STRIDE)
+        {
+                int last_byte;
+
+                if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+                        last_byte = ((voodoo->banshee_blt.srcX & 31) + voodoo->banshee_blt.dstSizeX + 7) >> 3;
+                else
+                        last_byte = (voodoo->banshee_blt.srcX & 3) + voodoo->banshee_blt.host_data_size_dest;
+
+                *(uint32_t *)&voodoo->banshee_blt.host_data[voodoo->banshee_blt.host_data_count] = data;
+                voodoo->banshee_blt.host_data_count += 4;
+                if (voodoo->banshee_blt.host_data_count >= last_byte)
+                {
+//                        pclog("  %i %i srcX=%i srcFormat=%08x\n", voodoo->banshee_blt.cur_y, voodoo->banshee_blt.dstSizeY, voodoo->banshee_blt.srcX);
+                        if (voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY)
+                        {
+                                if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+                                        do_screen_to_screen_line(voodoo, &voodoo->banshee_blt.host_data[(voodoo->banshee_blt.srcX >> 3) & 3], 0, voodoo->banshee_blt.srcX & 7, 0);
+                                else
+                                        do_screen_to_screen_line(voodoo, &voodoo->banshee_blt.host_data[voodoo->banshee_blt.srcX & 3], 0, 0, 0);
+                                voodoo->banshee_blt.cur_y++;
+                                if (voodoo->banshee_blt.cur_y == voodoo->banshee_blt.dstSizeY)
+                                        end_command(voodoo);
+                        }
+                        
+                        if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+                                voodoo->banshee_blt.srcX += (voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK) << 3;
+                        else
+                                voodoo->banshee_blt.srcX += (voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK);
+                        
+                        voodoo->banshee_blt.host_data_count = 0;
+                }
+        }
+        else
+        {
+                *(uint32_t *)&voodoo->banshee_blt.host_data[voodoo->banshee_blt.host_data_count] = data;
+                voodoo->banshee_blt.host_data_count += 4;
+                while (voodoo->banshee_blt.host_data_count >= voodoo->banshee_blt.src_stride_dest)
+                {
+                        voodoo->banshee_blt.host_data_count -= voodoo->banshee_blt.src_stride_dest;
+
+//                        pclog("  %i %i\n", voodoo->banshee_blt.cur_y, voodoo->banshee_blt.dstSizeY);
+                        if (voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY)
+                        {
+                                do_screen_to_screen_line(voodoo, voodoo->banshee_blt.host_data, 0, 0, 0);
+                                voodoo->banshee_blt.cur_y++;
+                                if (voodoo->banshee_blt.cur_y == voodoo->banshee_blt.dstSizeY)
+                                        end_command(voodoo);
+                        }
+
+                        if (voodoo->banshee_blt.host_data_count)
+                        {
+//                                pclog("  remaining=%i\n", voodoo->banshee_blt.host_data_count);
+                                *(uint32_t *)&voodoo->banshee_blt.host_data[0] = data >> (4-voodoo->banshee_blt.host_data_count)*8;
+                        }
+                }
+        }
+}
+
+static void do_screen_to_screen_stretch_line(voodoo_t *voodoo,uint8_t *src_p, int src_x, int *src_y)
+{
+        clip_t *clip = &voodoo->banshee_blt.clip[(voodoo->banshee_blt.command & COMMAND_CLIP_SEL) ? 1 : 0];
+//        int src_y = voodoo->banshee_blt.srcY;
+        int dst_y = voodoo->banshee_blt.dstY;
+        int pat_y = (voodoo->banshee_blt.commandExtra & CMDEXTRA_FORCE_PAT_ROW0) ? 0 : (voodoo->banshee_blt.patoff_y + voodoo->banshee_blt.dstY);
+        uint8_t *pattern_mono = (uint8_t *)voodoo->banshee_blt.colorPattern;
+        int use_pattern_trans = (voodoo->banshee_blt.command & (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO)) ==
+                                             (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO);
+        uint32_t *colorPattern = voodoo->banshee_blt.colorPattern;
+
+        //int error_y = voodoo->banshee_blt.dstSizeY / 2;
+        
+/*        pclog("banshee_do_screen_to_screen_stretch_blt:\n");
+        pclog("  srcXY=%i,%i srcsizeXY=%i,%i\n", voodoo->banshee_blt.srcX, voodoo->banshee_blt.srcY, voodoo->banshee_blt.srcSizeX, voodoo->banshee_blt.srcSizeY);
+        pclog("  dstXY=%i,%i dstsizeXY=%i,%i\n", voodoo->banshee_blt.dstX, voodoo->banshee_blt.dstY, voodoo->banshee_blt.dstSizeX, voodoo->banshee_blt.dstSizeY);*/
+        if (dst_y >= clip->y_min && dst_y < clip->y_max)
+        {
+//                int src_x = voodoo->banshee_blt.srcX;
+                int dst_x = voodoo->banshee_blt.dstX;
+                int pat_x = voodoo->banshee_blt.patoff_x + voodoo->banshee_blt.dstX;
+                uint8_t pattern_mask = pattern_mono[pat_y & 7];
+                int error_x = voodoo->banshee_blt.dstSizeX / 2;
+        
+//                pclog(" Plot dest line %03i : src line %03i\n", dst_y, src_y);
+                for (voodoo->banshee_blt.cur_x = 0; voodoo->banshee_blt.cur_x < voodoo->banshee_blt.dstSizeX; voodoo->banshee_blt.cur_x++)
+                {
+                        int pattern_trans = use_pattern_trans ? (pattern_mask & (1 << (7-(pat_x & 7)))) : 1;
+
+                        if (dst_x >= clip->x_min && dst_x < clip->x_max && pattern_trans)
+                        {
+                                switch (voodoo->banshee_blt.dstFormat & DST_FORMAT_COL_MASK)
+                                {
+                                        case DST_FORMAT_COL_8_BPP:
+                                        {
+                                                uint32_t dst_addr = get_addr(voodoo, dst_x, dst_y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + dst_x + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                uint32_t src = src_p[src_x];
+                                                uint32_t dest = voodoo->vram[dst_addr];
+                                                uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                colorPattern[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                voodoo->vram[dst_addr] = MIX(voodoo, dest, src, pattern, COLORKEY_8, COLORKEY_8);
+//                                                pclog("%i,%i : sdp=%02x,%02x,%02x res=%02x\n", voodoo->banshee_blt.cur_x, voodoo->banshee_blt.cur_y, src, dest, pattern, voodoo->vram[dst_addr]);
+                                                voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                break;
+                                        }
+                                        case DST_FORMAT_COL_16_BPP:
+                                        {
+                                                uint32_t dst_addr = get_addr(voodoo, dst_x*2, dst_y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + dst_x*2 + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                uint32_t src = *(uint16_t *)&src_p[src_x*2];
+                                                uint32_t dest = *(uint16_t *)&voodoo->vram[dst_addr];
+                                                uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                colorPattern[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                *(uint16_t *)&voodoo->vram[dst_addr] = MIX(voodoo, dest, src, pattern, COLORKEY_16, COLORKEY_16);
+//                                                pclog("%i,%i : sdp=%02x,%02x,%02x res=%02x\n", voodoo->banshee_blt.cur_x, voodoo->banshee_blt.cur_y, src, dest, pattern, *(uint16_t *)&voodoo->vram[dst_addr]);
+                                                voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                break;
+                                        }
+                                        case DST_FORMAT_COL_24_BPP:
+                                        {
+                                                uint32_t dst_addr = get_addr(voodoo, dst_x*3, dst_y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + dst_x*3 + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                uint32_t src = *(uint32_t *)&src_p[src_x*3];
+                                                uint32_t dest = *(uint32_t *)&voodoo->vram[dst_addr];
+                                                uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                colorPattern[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                *(uint32_t *)&voodoo->vram[dst_addr] = (MIX(voodoo, dest, src, pattern, COLORKEY_32, COLORKEY_32) & 0xffffff) | (*(uint32_t *)&voodoo->vram[dst_addr] & 0xff000000);
+//                                                pclog("%i,%i : sdp=%02x,%02x,%02x res=%02x\n", voodoo->banshee_blt.cur_x, voodoo->banshee_blt.cur_y, src, dest, pattern, voodoo->vram[dst_addr]);
+                                                voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                break;
+                                        }
+                                        case DST_FORMAT_COL_32_BPP:
+                                        {
+                                                uint32_t dst_addr = get_addr(voodoo, dst_x*4, dst_y, 0, 0);//(voodoo->banshee_blt.dstBaseAddr + dst_x*4 + dst_y*voodoo->banshee_blt.dst_stride) & voodoo->fb_mask;
+                                                uint32_t src = *(uint32_t *)&src_p[src_x*4];
+                                                uint32_t dest = *(uint32_t *)&voodoo->vram[dst_addr];
+                                                uint32_t pattern = (voodoo->banshee_blt.command & COMMAND_PATTERN_MONO) ?
+                                                                ((pattern_mask & (1 << (7-(pat_x & 7)))) ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack) :
+                                                                colorPattern[(pat_x & 7) + (pat_y & 7)*8];
+
+                                                *(uint32_t *)&voodoo->vram[dst_addr] = MIX(voodoo, dest, src, pattern, COLORKEY_32, COLORKEY_32);
+//                                                pclog("%i,%i : sdp=%02x,%02x,%02x res=%02x\n", voodoo->banshee_blt.cur_x, voodoo->banshee_blt.cur_y, src, dest, pattern, voodoo->vram[dst_addr]);
+                                                voodoo->changedvram[dst_addr >> 12] = changeframecount;
+                                                break;
+                                        }
+                                }
+                        }
+
+                        error_x -= voodoo->banshee_blt.srcSizeX;
+                        while (error_x < 0)
+                        {
+                                error_x += voodoo->banshee_blt.dstSizeX;
+                                src_x++;
+                        }
+                        dst_x++;
+                        pat_x++;
+                }
+        }
+                
+        voodoo->banshee_blt.bres_error_0 -= voodoo->banshee_blt.srcSizeY;
+        while (voodoo->banshee_blt.bres_error_0 < 0)
+        {
+                voodoo->banshee_blt.bres_error_0 += voodoo->banshee_blt.dstSizeY;
+                if (src_y)
+                        (*src_y) += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+        }
+        voodoo->banshee_blt.dstY += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+//        pat_y += (voodoo->banshee_blt.command & COMMAND_DY) ? -1 : 1;
+}
+
+static void banshee_do_screen_to_screen_stretch_blt(voodoo_t *voodoo)
+{
+//        pclog("screen_to_screen: %08x %08x %08x\n", voodoo->banshee_blt.srcFormat, voodoo->banshee_blt.src_stride, voodoo->banshee_blt.src_stride_dest);
+//                return;
+        for (voodoo->banshee_blt.cur_y = 0; voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY; voodoo->banshee_blt.cur_y++)
+        {
+                uint32_t src_addr = get_addr(voodoo, 0, voodoo->banshee_blt.srcY, 1, voodoo->banshee_blt.src_stride_src);//(voodoo->banshee_blt.srcBaseAddr + voodoo->banshee_blt.srcY*voodoo->banshee_blt.src_stride_src) & voodoo->fb_mask;
+//                pclog("scale_blit %i %08x  %08x\n", voodoo->banshee_blt.cur_y, src_addr, voodoo->banshee_blt.command);
+//                if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+//                        pclog(" srcY=%i src_addr=%08x\n", voodoo->banshee_blt.srcY, src_addr);
+                do_screen_to_screen_stretch_line(voodoo, &voodoo->vram[src_addr], voodoo->banshee_blt.srcX, &voodoo->banshee_blt.srcY);
+        }
+        end_command(voodoo);
+}
+
+static void banshee_do_host_to_screen_stretch_blt(voodoo_t *voodoo, int count, uint32_t data)
+{
+//        if (voodoo->banshee_blt.dstBaseAddr == 0xee5194)
+//                pclog("banshee_do_host_to_screen_blt: data=%08x host_data_count=%i src_stride_dest=%i host_data_size_dest=%i\n", data, voodoo->banshee_blt.host_data_count, voodoo->banshee_blt.src_stride_dest, voodoo->banshee_blt.host_data_size_dest);
+
+        if (voodoo->banshee_blt.srcFormat & SRC_FORMAT_BYTE_SWIZZLE)
+                data = (data >> 24) | ((data >> 8) & 0xff00) | ((data << 8) & 0xff0000) | (data << 24);
+        if (voodoo->banshee_blt.srcFormat & SRC_FORMAT_WORD_SWIZZLE)
+                data = (data >> 16) | (data << 16);
+
+        if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_PACKING_MASK) == SRC_FORMAT_PACKING_STRIDE)
+        {
+                int last_byte = (voodoo->banshee_blt.srcX & 3) + voodoo->banshee_blt.host_data_size_src;
+
+                *(uint32_t *)&voodoo->banshee_blt.host_data[voodoo->banshee_blt.host_data_count] = data;
+                voodoo->banshee_blt.host_data_count += 4;
+                if (voodoo->banshee_blt.host_data_count >= last_byte)
+                {
+//                        pclog("  %i %i srcX=%i srcFormat=%08x\n", voodoo->banshee_blt.cur_y, voodoo->banshee_blt.dstSizeY, voodoo->banshee_blt.srcX);
+                        if (voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY)
+                        {
+                                if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+                                        do_screen_to_screen_stretch_line(voodoo, &voodoo->banshee_blt.host_data[(voodoo->banshee_blt.srcX >> 3) & 3], voodoo->banshee_blt.srcX & 7, NULL);
+                                else
+                                        do_screen_to_screen_stretch_line(voodoo, &voodoo->banshee_blt.host_data[voodoo->banshee_blt.srcX & 3], 0, NULL);
+                                voodoo->banshee_blt.cur_y++;
+                                if (voodoo->banshee_blt.cur_y == voodoo->banshee_blt.dstSizeY)
+                                        end_command(voodoo);
+                        }
+
+                        if ((voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK) == SRC_FORMAT_COL_1_BPP)
+                                voodoo->banshee_blt.srcX += (voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK) << 3;
+                        else
+                                voodoo->banshee_blt.srcX += (voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK);
+
+                        voodoo->banshee_blt.host_data_count = 0;
+                }
+        }
+        else
+        {
+                *(uint32_t *)&voodoo->banshee_blt.host_data[voodoo->banshee_blt.host_data_count] = data;
+                voodoo->banshee_blt.host_data_count += 4;
+                while (voodoo->banshee_blt.host_data_count >= voodoo->banshee_blt.src_stride_src)
+                {
+                        voodoo->banshee_blt.host_data_count -= voodoo->banshee_blt.src_stride_src;
+
+//                        pclog("  %i %i\n", voodoo->banshee_blt.cur_y, voodoo->banshee_blt.dstSizeY);
+                        if (voodoo->banshee_blt.cur_y < voodoo->banshee_blt.dstSizeY)
+                        {
+                                do_screen_to_screen_stretch_line(voodoo, voodoo->banshee_blt.host_data, 0, NULL);
+                                voodoo->banshee_blt.cur_y++;
+                                if (voodoo->banshee_blt.cur_y == voodoo->banshee_blt.dstSizeY)
+                                        end_command(voodoo);
+                        }
+
+                        if (voodoo->banshee_blt.host_data_count)
+                        {
+//                                pclog("  remaining=%i\n", voodoo->banshee_blt.host_data_count);
+                                *(uint32_t *)&voodoo->banshee_blt.host_data[0] = data >> (4-voodoo->banshee_blt.host_data_count)*8;
+                        }
+                }
+        }
+}
+
+static void step_line(voodoo_t *voodoo)
+{
+        if (voodoo->banshee_blt.line_pix_pos == voodoo->banshee_blt.line_rep_cnt)
+        {
+                voodoo->banshee_blt.line_pix_pos = 0;
+                if (voodoo->banshee_blt.line_bit_pos == voodoo->banshee_blt.line_bit_mask_size)
+                        voodoo->banshee_blt.line_bit_pos = 0;
+                else
+                        voodoo->banshee_blt.line_bit_pos++;
+        }
+        else
+                voodoo->banshee_blt.line_pix_pos++;
+}
+
+static void banshee_do_line(voodoo_t *voodoo, int draw_last_pixel)
+{
+        clip_t *clip = &voodoo->banshee_blt.clip[(voodoo->banshee_blt.command & COMMAND_CLIP_SEL) ? 1 : 0];
+        uint8_t rop = voodoo->banshee_blt.command >> 24;
+        int dx = ABS(voodoo->banshee_blt.dstX - voodoo->banshee_blt.srcX);
+        int dy = ABS(voodoo->banshee_blt.dstY - voodoo->banshee_blt.srcY);
+        int x_inc = (voodoo->banshee_blt.dstX > voodoo->banshee_blt.srcX) ? 1 : -1;
+        int y_inc = (voodoo->banshee_blt.dstY > voodoo->banshee_blt.srcY) ? 1 : -1;
+        int x = voodoo->banshee_blt.srcX;
+        int y = voodoo->banshee_blt.srcY;
+        int error;
+        uint32_t stipple = (voodoo->banshee_blt.command & COMMAND_STIPPLE_LINE) ?
+                        voodoo->banshee_blt.lineStipple : ~0;
+
+        if (dx > dy) /*X major*/
+        {
+                error = dx/2;
+                while (x != voodoo->banshee_blt.dstX)
+                {
+                        int mask = stipple & (1 << voodoo->banshee_blt.line_bit_pos);
+                        int pattern_trans = (voodoo->banshee_blt.command & COMMAND_TRANS_MONO) ? mask : 1;
+                                
+                        if (y >= clip->y_min && y < clip->y_max && x >= clip->x_min && x < clip->x_max && pattern_trans)
+                                PLOT_LINE(voodoo, x, y, rop, mask ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack, COLORKEY_32);
+
+                        error -= dy;
+                        if (error < 0)
+                        {
+                                error += dx;
+                                y += y_inc;
+                        }
+                        x += x_inc;
+                        step_line(voodoo);
+                }
+        }
+        else         /*Y major*/
+        {
+                error = dy/2;
+                while (y != voodoo->banshee_blt.dstY)
+                {
+                        int mask = stipple & (1 << voodoo->banshee_blt.line_bit_pos);
+                        int pattern_trans = (voodoo->banshee_blt.command & COMMAND_TRANS_MONO) ? mask : 1;
+
+                        if (y >= clip->y_min && y < clip->y_max && x >= clip->x_min && x < clip->x_max && pattern_trans)
+                                PLOT_LINE(voodoo, x, y, rop, mask ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack, COLORKEY_32);
+
+                        error -= dx;
+                        if (error < 0)
+                        {
+                                error += dy;
+                                x += x_inc;
+                        }
+                        y += y_inc;
+                        step_line(voodoo);
+                }
+        }
+
+        if (draw_last_pixel)
+        {
+                int mask = stipple & (1 << voodoo->banshee_blt.line_bit_pos);
+                int pattern_trans = (voodoo->banshee_blt.command & COMMAND_TRANS_MONO) ? mask : 1;
+
+                if (y >= clip->y_min && y < clip->y_max && x >= clip->x_min && x < clip->x_max && pattern_trans)
+                        PLOT_LINE(voodoo, x, y, rop, mask ? voodoo->banshee_blt.colorFore : voodoo->banshee_blt.colorBack, COLORKEY_32);
+        }
+
+        voodoo->banshee_blt.srcXY = (x & 0xffff) | (y << 16);
+        voodoo->banshee_blt.srcX = x;
+        voodoo->banshee_blt.srcY = y;
+}
+
+static void banshee_polyfill_start(voodoo_t *voodoo)
+{
+        voodoo->banshee_blt.lx[0] = voodoo->banshee_blt.srcX;
+        voodoo->banshee_blt.ly[0] = voodoo->banshee_blt.srcY;
+        voodoo->banshee_blt.rx[0] = voodoo->banshee_blt.dstX;
+        voodoo->banshee_blt.ry[0] = voodoo->banshee_blt.dstY;
+        voodoo->banshee_blt.lx[1] = voodoo->banshee_blt.srcX;
+        voodoo->banshee_blt.ly[1] = voodoo->banshee_blt.srcY;
+        voodoo->banshee_blt.rx[1] = voodoo->banshee_blt.dstX;
+        voodoo->banshee_blt.ry[1] = voodoo->banshee_blt.dstY;
+        voodoo->banshee_blt.lx_cur = voodoo->banshee_blt.srcX;
+        voodoo->banshee_blt.rx_cur = voodoo->banshee_blt.dstX;
+}
+
+static void banshee_polyfill_continue(voodoo_t *voodoo, uint32_t data)
+{
+        clip_t *clip = &voodoo->banshee_blt.clip[(voodoo->banshee_blt.command & COMMAND_CLIP_SEL) ? 1 : 0];
+        uint8_t *pattern_mono = (uint8_t *)voodoo->banshee_blt.colorPattern;
+        int use_pattern_trans = (voodoo->banshee_blt.command & (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO)) ==
+                                             (COMMAND_PATTERN_MONO | COMMAND_TRANS_MONO);
+        uint8_t rop = voodoo->banshee_blt.command >> 24;
+        int y = MAX(voodoo->banshee_blt.ly[0], voodoo->banshee_blt.ry[0]);
+        int y_end;
+        
+//        pclog("Polyfill : data %08x\n", data);
+
+        /*if r1.y>=l1.y, next vertex is left*/
+        if (voodoo->banshee_blt.ry[1] >= voodoo->banshee_blt.ly[1])
+        {
+                voodoo->banshee_blt.lx[1] = ((int32_t)(data << 19)) >> 19;
+                voodoo->banshee_blt.ly[1] = ((int32_t)(data << 3)) >> 19;
+                voodoo->banshee_blt.dx[0] = ABS(voodoo->banshee_blt.lx[1] - voodoo->banshee_blt.lx[0]);
+                voodoo->banshee_blt.dy[0] = ABS(voodoo->banshee_blt.ly[1] - voodoo->banshee_blt.ly[0]);
+                voodoo->banshee_blt.x_inc[0] = (voodoo->banshee_blt.lx[1] > voodoo->banshee_blt.lx[0]) ? 1 : -1;
+                voodoo->banshee_blt.error[0] = voodoo->banshee_blt.dy[0] / 2;
+        }
+        else
+        {
+                voodoo->banshee_blt.rx[1] = ((int32_t)(data << 19)) >> 19;
+                voodoo->banshee_blt.ry[1] = ((int32_t)(data << 3)) >> 19;
+                voodoo->banshee_blt.dx[1] = ABS(voodoo->banshee_blt.rx[1] - voodoo->banshee_blt.rx[0]);
+                voodoo->banshee_blt.dy[1] = ABS(voodoo->banshee_blt.ry[1] - voodoo->banshee_blt.ry[0]);
+                voodoo->banshee_blt.x_inc[1] = (voodoo->banshee_blt.rx[1] > voodoo->banshee_blt.rx[0]) ? 1 : -1;
+                voodoo->banshee_blt.error[1] = voodoo->banshee_blt.dy[1] / 2;
+        }
+        
+/*        pclog("   verts now : %03i,%03i    %03i,%03i\n", voodoo->banshee_blt.lx[0], voodoo->banshee_blt.ly[0], voodoo->banshee_blt.rx[0], voodoo->banshee_blt.ry[0]);
+        pclog("               %03i,%03i    %03i,%03i\n", voodoo->banshee_blt.lx[1], voodoo->banshee_blt.ly[1], voodoo->banshee_blt.rx[1], voodoo->banshee_blt.ry[1]);
+        pclog("        left  dx=%i dy=%i x_inc=%i error=%i\n", voodoo->banshee_blt.dx[0],voodoo->banshee_blt.dy[0],voodoo->banshee_blt.x_inc[0],voodoo->banshee_blt.error[0]);
+        pclog("        right dx=%i dy=%i x_inc=%i error=%i\n", voodoo->banshee_blt.dx[1],voodoo->banshee_blt.dy[1],voodoo->banshee_blt.x_inc[1],voodoo->banshee_blt.error[1]);*/
+        y_end = MIN(voodoo->banshee_blt.ly[1], voodoo->banshee_blt.ry[1]);
+//        pclog("Polyfill : draw spans from %i-%i\n", y, y_end);
+        for (; y < y_end; y++)
+        {
+//                pclog("   %i:  %i %i\n", y, voodoo->banshee_blt.lx_cur, voodoo->banshee_blt.rx_cur);
+                /*Draw span from lx_cur to rx_cur*/
+                if (y >= clip->y_min && y < clip->y_max)
+                {
+                        int pat_y = (voodoo->banshee_blt.commandExtra & CMDEXTRA_FORCE_PAT_ROW0) ? 0 : (voodoo->banshee_blt.patoff_y + y);
+                        uint8_t pattern_mask = pattern_mono[pat_y & 7];
+                        int x;
+                        
+                        for (x = voodoo->banshee_blt.lx_cur; x < voodoo->banshee_blt.rx_cur; x++)
+                        {
+                                int pat_x = voodoo->banshee_blt.patoff_x + x;
+                                int pattern_trans = use_pattern_trans ? (pattern_mask & (1 << (7-(pat_x & 7)))) : 1;
+
+                                if (x >= clip->x_min && x < clip->x_max && pattern_trans)
+                                        PLOT(voodoo, x, y, pat_x, pat_y, pattern_mask, rop, voodoo->banshee_blt.colorFore, COLORKEY_32);
+                        }
+                }
+                
+                voodoo->banshee_blt.error[0] -= voodoo->banshee_blt.dx[0];
+                while (voodoo->banshee_blt.error[0] < 0)
+                {
+                        voodoo->banshee_blt.error[0] += voodoo->banshee_blt.dy[0];
+                        voodoo->banshee_blt.lx_cur += voodoo->banshee_blt.x_inc[0];
+                }
+                voodoo->banshee_blt.error[1] -= voodoo->banshee_blt.dx[1];
+                while (voodoo->banshee_blt.error[1] < 0)
+                {
+                        voodoo->banshee_blt.error[1] += voodoo->banshee_blt.dy[1];
+                        voodoo->banshee_blt.rx_cur += voodoo->banshee_blt.x_inc[1];
+                }
+        }
+
+        if (voodoo->banshee_blt.ry[1] == voodoo->banshee_blt.ly[1])
+        {
+                voodoo->banshee_blt.lx[0] = voodoo->banshee_blt.lx[1];
+                voodoo->banshee_blt.ly[0] = voodoo->banshee_blt.ly[1];
+                voodoo->banshee_blt.rx[0] = voodoo->banshee_blt.rx[1];
+                voodoo->banshee_blt.ry[0] = voodoo->banshee_blt.ry[1];
+        }
+        else if (voodoo->banshee_blt.ry[1] >= voodoo->banshee_blt.ly[1])
+        {
+                voodoo->banshee_blt.lx[0] = voodoo->banshee_blt.lx[1];
+                voodoo->banshee_blt.ly[0] = voodoo->banshee_blt.ly[1];
+        }
+        else
+        {
+                voodoo->banshee_blt.rx[0] = voodoo->banshee_blt.rx[1];
+                voodoo->banshee_blt.ry[0] = voodoo->banshee_blt.ry[1];
+        }
+}
+
+/*Run the 2D operation currently selected in the command register.
+  count and data are only handed to the two host-to-screen handlers; every
+  other handler is passed voodoo alone.*/
+static void banshee_do_2d_blit(voodoo_t *voodoo, int count, uint32_t data)
+{
+        switch (voodoo->banshee_blt.command & COMMAND_CMD_MASK)
+        {
+                case COMMAND_CMD_NOP:
+                        return;
+
+                /*Fills and lines*/
+                case COMMAND_CMD_RECTFILL:
+                        banshee_do_rectfill(voodoo);
+                        return;
+
+                case COMMAND_CMD_LINE:
+                        banshee_do_line(voodoo, 1);
+                        return;
+
+                case COMMAND_CMD_POLYLINE:
+                        banshee_do_line(voodoo, 0);
+                        return;
+
+                /*Screen-to-screen copies*/
+                case COMMAND_CMD_SCREEN_TO_SCREEN_BLT:
+                        banshee_do_screen_to_screen_blt(voodoo);
+                        return;
+
+                case COMMAND_CMD_SCREEN_TO_SCREEN_STRETCH_BLT:
+                        banshee_do_screen_to_screen_stretch_blt(voodoo);
+                        return;
+
+                /*Host-to-screen copies, the only ones that consume count/data*/
+                case COMMAND_CMD_HOST_TO_SCREEN_BLT:
+                        banshee_do_host_to_screen_blt(voodoo, count, data);
+                        return;
+
+                case COMMAND_CMD_HOST_TO_SCREEN_STRETCH_BLT:
+                        banshee_do_host_to_screen_stretch_blt(voodoo, count, data);
+                        return;
+
+#ifndef RELEASE_BUILD
+                /*Any other command value is reported in non-release builds only*/
+                default:
+                        fatal("banshee_do_2d_blit: unknown command=%08x\n", voodoo->banshee_blt.command);
+#endif
+        }
+}
+
+/*Handle a 32-bit write to the Banshee 2D register block.
+  The register is selected by (addr & 0x1fc), so only dword offsets
+  0x000-0x1fc are distinguished. Most registers latch the raw value into
+  voodoo->banshee_blt and pre-decode the fields used elsewhere.
+    0x70         command register: resets per-operation state and, if
+                 COMMAND_INITIATE is set, starts the operation
+    0x80-0xfc    launch area: each write feeds a coordinate pair or 32 bits
+                 of host data to the current command
+    0x100-0x1fc  colour pattern, mirrored into 24/16/8 bpp tables
+  In non-release builds an offset with no case below ends in fatal().*/
+void voodoo_2d_reg_writel(voodoo_t *voodoo, uint32_t addr, uint32_t val)
+{
+//        /*if ((addr & 0x1fc) != 0x80) */pclog("2D reg write %03x %08x\n", addr & 0x1fc, val);
+        switch (addr & 0x1fc)
+        {
+                /*clip0Min / clip0Max: X in bits 0-11, Y in bits 16-27*/
+                case 0x08:
+                voodoo->banshee_blt.clip0Min = val;
+                voodoo->banshee_blt.clip[0].x_min = val & 0xfff;
+                voodoo->banshee_blt.clip[0].y_min = (val >> 16) & 0xfff;
+                break;
+                case 0x0c:
+                voodoo->banshee_blt.clip0Max = val;
+                voodoo->banshee_blt.clip[0].x_max = val & 0xfff;
+                voodoo->banshee_blt.clip[0].y_max = (val >> 16) & 0xfff;
+                break;
+                /*dstBaseAddr: address in bits 0-23, bit 31 is the tiled flag.
+                  dst_stride depends on that flag, so it is recomputed here as
+                  well as on dstFormat writes (tiled: stride field * 128*32)*/
+                case 0x10:
+                voodoo->banshee_blt.dstBaseAddr = val & 0xffffff;
+                voodoo->banshee_blt.dstBaseAddr_tiled = val & 0x80000000;
+                if (voodoo->banshee_blt.dstBaseAddr_tiled)
+                        voodoo->banshee_blt.dst_stride = (voodoo->banshee_blt.dstFormat & DST_FORMAT_STRIDE_MASK) * 128*32;
+                else
+                        voodoo->banshee_blt.dst_stride = voodoo->banshee_blt.dstFormat & DST_FORMAT_STRIDE_MASK;
+//                pclog("dstBaseAddr=%08x\n", val);
+                break;
+                /*dstFormat: raw value kept, dst_stride re-derived*/
+                case 0x14:
+                voodoo->banshee_blt.dstFormat = val;
+                if (voodoo->banshee_blt.dstBaseAddr_tiled)
+                        voodoo->banshee_blt.dst_stride = (voodoo->banshee_blt.dstFormat & DST_FORMAT_STRIDE_MASK) * 128*32;
+                else
+                        voodoo->banshee_blt.dst_stride = voodoo->banshee_blt.dstFormat & DST_FORMAT_STRIDE_MASK;
+//                pclog("dstFormat=%08x\n", val);
+                break;
+                
+                /*Source and destination colour key ranges, 24 bits each*/
+                case 0x18:
+                voodoo->banshee_blt.srcColorkeyMin = val & 0xffffff;
+                break;
+                case 0x1c:
+                voodoo->banshee_blt.srcColorkeyMax = val & 0xffffff;
+                break;
+                case 0x20:
+                voodoo->banshee_blt.dstColorkeyMin = val & 0xffffff;
+                break;
+                case 0x24:
+                voodoo->banshee_blt.dstColorkeyMax = val & 0xffffff;
+                break;
+                
+                /*bresError0/1: raw value kept, low 16 bits copied to the working error terms*/
+                case 0x28:
+                voodoo->banshee_blt.bresError0 = val;
+                voodoo->banshee_blt.bres_error_0 = val & 0xffff;
+                break;
+                case 0x2c:
+                voodoo->banshee_blt.bresError1 = val;
+                voodoo->banshee_blt.bres_error_1 = val & 0xffff;
+                break;
+                
+                /*rop: three 8-bit ROPs for rops[1..3]; rops[0] is taken from the
+                  top byte of the command register (case 0x70)*/
+                case 0x30:
+                voodoo->banshee_blt.rop = val;
+                voodoo->banshee_blt.rops[1] = val & 0xff;
+                voodoo->banshee_blt.rops[2] = (val >> 8) & 0xff;
+                voodoo->banshee_blt.rops[3] = (val >> 16) & 0xff;
+//                pclog("rop=%08x\n", val);
+                break;
+                /*srcBaseAddr: same layout as dstBaseAddr; src_stride is re-derived
+                  and then update_src_stride() is called*/
+                case 0x34:
+                voodoo->banshee_blt.srcBaseAddr = val & 0xffffff;
+                voodoo->banshee_blt.srcBaseAddr_tiled = val & 0x80000000;
+                if (voodoo->banshee_blt.srcBaseAddr_tiled)
+                        voodoo->banshee_blt.src_stride = (voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK) * 128*32;
+                else
+                        voodoo->banshee_blt.src_stride = voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK;
+                update_src_stride(voodoo);
+//                pclog("srcBaseAddr=%08x\n", val);
+                break;
+                case 0x38:
+                voodoo->banshee_blt.commandExtra = val;
+//                pclog("commandExtra=%08x\n", val);
+                break;
+                case 0x3c:
+                voodoo->banshee_blt.lineStipple = val;
+                break;
+                /*lineStyle: repeat count (bits 0-7), bit mask size (bits 8-12),
+                  pixel position (bits 16-23), bit position (bits 24-28)*/
+                case 0x40:
+                voodoo->banshee_blt.lineStyle = val;
+                voodoo->banshee_blt.line_rep_cnt = val & 0xff;
+                voodoo->banshee_blt.line_bit_mask_size = (val >> 8) & 0x1f;
+                voodoo->banshee_blt.line_pix_pos = (val >> 16) & 0xff;
+                voodoo->banshee_blt.line_bit_pos = (val >> 24) & 0x1f;
+                break;
+                /*colorPattern 0: stored raw and unpacked into the 24/16/8 bpp
+                  tables. Its top byte is the low byte of 24 bpp entry 1*/
+                case 0x44:
+                voodoo->banshee_blt.colorPattern[0] = val;
+//                pclog("colorPattern0=%08x\n", val);
+                voodoo->banshee_blt.colorPattern24[0] = val & 0xffffff;
+                voodoo->banshee_blt.colorPattern24[1] = (voodoo->banshee_blt.colorPattern24[1] & 0xffff00) | (val >> 24);
+                voodoo->banshee_blt.colorPattern16[0] = val & 0xffff;
+                voodoo->banshee_blt.colorPattern16[1] = (val >> 16) & 0xffff;
+                voodoo->banshee_blt.colorPattern8[0] = val & 0xff;
+                voodoo->banshee_blt.colorPattern8[1] = (val >> 8) & 0xff;
+                voodoo->banshee_blt.colorPattern8[2] = (val >> 16) & 0xff;
+                voodoo->banshee_blt.colorPattern8[3] = (val >> 24) & 0xff;
+                break;
+                /*colorPattern 1: low 16 bits complete 24 bpp entry 1, high 16
+                  bits are the low 16 bits of 24 bpp entry 2*/
+                case 0x48:
+                voodoo->banshee_blt.colorPattern[1] = val;
+//                pclog("colorPattern1=%08x\n", val);
+                voodoo->banshee_blt.colorPattern24[1] = (voodoo->banshee_blt.colorPattern24[1] & 0xff) | ((val & 0xffff) << 8);
+                voodoo->banshee_blt.colorPattern24[2] = (voodoo->banshee_blt.colorPattern24[2] & 0xff0000) | (val >> 16);
+                voodoo->banshee_blt.colorPattern16[2] = val & 0xffff;
+                voodoo->banshee_blt.colorPattern16[3] = (val >> 16) & 0xffff;
+                voodoo->banshee_blt.colorPattern8[4] = val & 0xff;
+                voodoo->banshee_blt.colorPattern8[5] = (val >> 8) & 0xff;
+                voodoo->banshee_blt.colorPattern8[6] = (val >> 16) & 0xff;
+                voodoo->banshee_blt.colorPattern8[7] = (val >> 24) & 0xff;
+                break;
+                /*clip1Min / clip1Max: same layout as clip0*/
+                case 0x4c:
+                voodoo->banshee_blt.clip1Min = val;
+                voodoo->banshee_blt.clip[1].x_min = val & 0xfff;
+                voodoo->banshee_blt.clip[1].y_min = (val >> 16) & 0xfff;
+                break;
+                case 0x50:
+                voodoo->banshee_blt.clip1Max = val;
+                voodoo->banshee_blt.clip[1].x_max = val & 0xfff;
+                voodoo->banshee_blt.clip[1].y_max = (val >> 16) & 0xfff;
+                break;
+                /*srcFormat: re-derives src_stride and decodes the colour format
+                  into src_bpp; formats not listed below are treated as 16 bpp*/
+                case 0x54:
+                voodoo->banshee_blt.srcFormat = val;
+                if (voodoo->banshee_blt.srcBaseAddr_tiled)
+                        voodoo->banshee_blt.src_stride = (voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK) * 128*32;
+                else
+                        voodoo->banshee_blt.src_stride = voodoo->banshee_blt.srcFormat & SRC_FORMAT_STRIDE_MASK;
+                update_src_stride(voodoo);
+                switch (voodoo->banshee_blt.srcFormat & SRC_FORMAT_COL_MASK)
+                {
+                        case SRC_FORMAT_COL_1_BPP:
+                        voodoo->banshee_blt.src_bpp = 1;
+                        break;
+                        case SRC_FORMAT_COL_8_BPP:
+                        voodoo->banshee_blt.src_bpp = 8;
+                        break;
+                        case SRC_FORMAT_COL_24_BPP:
+                        voodoo->banshee_blt.src_bpp = 24;
+                        break;
+                        case SRC_FORMAT_COL_32_BPP:
+                        voodoo->banshee_blt.src_bpp = 32;
+                        break;
+                        case SRC_FORMAT_COL_16_BPP: default:
+                        voodoo->banshee_blt.src_bpp = 16;
+                        break;
+                }
+//                pclog("srcFormat=%08x\n", val);
+                break;
+                /*srcSize: width in bits 0-12, height in bits 16-28*/
+                case 0x58:
+                voodoo->banshee_blt.srcSize = val;
+                voodoo->banshee_blt.srcSizeX = voodoo->banshee_blt.srcSize & 0x1fff;
+                voodoo->banshee_blt.srcSizeY = (voodoo->banshee_blt.srcSize >> 16) & 0x1fff;
+                update_src_stride(voodoo);
+//                pclog("srcSize=%08x\n", val);
+                break;
+                /*srcXY: X is bits 0-12 and Y is bits 16-28, each sign-extended by
+                  shifting the field to the top of an int32_t and back down*/
+                case 0x5c:
+                voodoo->banshee_blt.srcXY = val;
+                voodoo->banshee_blt.srcX = ((int32_t)(val << 19)) >> 19;
+                voodoo->banshee_blt.srcY = ((int32_t)(val << 3)) >> 19;
+                update_src_stride(voodoo);
+//                pclog("srcXY=%08x\n", val);
+                break;
+                case 0x60:
+                voodoo->banshee_blt.colorBack = val;
+                break;
+                case 0x64:
+                voodoo->banshee_blt.colorFore = val;
+                break;
+                /*dstSize: same layout as srcSize. update_src_stride() is called
+                  here too - NOTE(review): presumably the source stride can
+                  depend on the destination size; confirm in update_src_stride()*/
+                case 0x68:
+                voodoo->banshee_blt.dstSize = val;
+                voodoo->banshee_blt.dstSizeX = voodoo->banshee_blt.dstSize & 0x1fff;
+                voodoo->banshee_blt.dstSizeY = (voodoo->banshee_blt.dstSize >> 16) & 0x1fff;
+                update_src_stride(voodoo);
+//                pclog("dstSize=%08x\n", val);
+                break;
+                /*dstXY: same sign-extended layout as srcXY*/
+                case 0x6c:
+                voodoo->banshee_blt.dstXY = val;
+                voodoo->banshee_blt.dstX = ((int32_t)(val << 19)) >> 19;
+                voodoo->banshee_blt.dstY = ((int32_t)(val << 3)) >> 19;
+//                pclog("dstXY=%08x\n", val);
+                break;
+                /*command: waits for the render thread to go idle, latches the
+                  command, then resets all per-operation state. The working
+                  coordinates are re-derived from the raw srcXY/dstXY values*/
+                case 0x70:
+                voodoo_wait_for_render_thread_idle(voodoo);
+                voodoo->banshee_blt.command = val;
+                voodoo->banshee_blt.rops[0] = val >> 24;
+//                pclog("command=%x %08x\n", voodoo->banshee_blt.command & COMMAND_CMD_MASK, val);
+                voodoo->banshee_blt.patoff_x = (val & COMMAND_PATOFF_X_MASK) >> COMMAND_PATOFF_X_SHIFT;
+                voodoo->banshee_blt.patoff_y = (val & COMMAND_PATOFF_Y_MASK) >> COMMAND_PATOFF_Y_SHIFT;
+                voodoo->banshee_blt.cur_x = 0;
+                voodoo->banshee_blt.cur_y = 0;
+                voodoo->banshee_blt.dstX = ((int32_t)(voodoo->banshee_blt.dstXY << 19)) >> 19;
+                voodoo->banshee_blt.dstY = ((int32_t)(voodoo->banshee_blt.dstXY << 3)) >> 19;
+                voodoo->banshee_blt.srcX = ((int32_t)(voodoo->banshee_blt.srcXY << 19)) >> 19;
+                voodoo->banshee_blt.srcY = ((int32_t)(voodoo->banshee_blt.srcXY << 3)) >> 19;
+                voodoo->banshee_blt.old_srcX = voodoo->banshee_blt.srcX;
+                voodoo->banshee_blt.host_data_remainder = 0;
+                voodoo->banshee_blt.host_data_count = 0;
+                switch (voodoo->banshee_blt.command & COMMAND_CMD_MASK)
+                {
+/*                        case COMMAND_CMD_SCREEN_TO_SCREEN_STRETCH_BLT:
+                        if (voodoo->banshee_blt.bresError0 & BRES_ERROR_USE)
+                                voodoo->banshee_blt.bres_error_0 = (int32_t)(int16_t)(voodoo->banshee_blt.bresError0 & BRES_ERROR_MASK);
+                        else
+                                voodoo->banshee_blt.bres_error_0 = voodoo->banshee_blt.dstSizeY / 2;
+                        if (voodoo->banshee_blt.bresError1 & BRES_ERROR_USE)
+                                voodoo->banshee_blt.bres_error_1 = (int32_t)(int16_t)(voodoo->banshee_blt.bresError1 & BRES_ERROR_MASK);
+                        else
+                                voodoo->banshee_blt.bres_error_1 = voodoo->banshee_blt.dstSizeX / 2;
+
+                        if (val & COMMAND_INITIATE)
+                                banshee_do_2d_blit(voodoo, -1, 0);
+                        break;*/
+                        
+                        /*Polyfill: with COMMAND_INITIATE the destination point is
+                          set to the source point. banshee_polyfill_start() runs
+                          whether or not COMMAND_INITIATE is set*/
+                        case COMMAND_CMD_POLYFILL:
+                        if (val & COMMAND_INITIATE)
+                        {
+                                voodoo->banshee_blt.dstXY = voodoo->banshee_blt.srcXY;
+                                voodoo->banshee_blt.dstX = voodoo->banshee_blt.srcX;
+                                voodoo->banshee_blt.dstY = voodoo->banshee_blt.srcY;
+                        }
+                        banshee_polyfill_start(voodoo);
+                        break;
+
+                        /*Everything else only runs now if COMMAND_INITIATE is set;
+                          count = -1 / data = 0 mean no host data is supplied*/
+                        default:
+                        if (val & COMMAND_INITIATE)
+                        {
+                                banshee_do_2d_blit(voodoo, -1, 0);
+                        //       fatal("Initiate command!\n");
+                        }
+                        break;
+                }
+                break;
+                
+                /*Launch area. The meaning of val depends on the current command:
+                  a new source point (screen-to-screen blit), a new destination
+                  point (rectfill, line, polyline), 32 bits of host data
+                  (host-to-screen blits) or the argument passed to
+                  banshee_polyfill_continue(). The operation then runs at once*/
+                case 0x80: case 0x84: case 0x88: case 0x8c:
+                case 0x90: case 0x94: case 0x98: case 0x9c:
+                case 0xa0: case 0xa4: case 0xa8: case 0xac:
+                case 0xb0: case 0xb4: case 0xb8: case 0xbc:
+                case 0xc0: case 0xc4: case 0xc8: case 0xcc:
+                case 0xd0: case 0xd4: case 0xd8: case 0xdc:
+                case 0xe0: case 0xe4: case 0xe8: case 0xec:
+                case 0xf0: case 0xf4: case 0xf8: case 0xfc:
+//                pclog("launch %08x  %08x %08x %08x\n", voodoo->banshee_blt.command,  voodoo->banshee_blt.commandExtra, voodoo->banshee_blt.srcColorkeyMin, voodoo->banshee_blt.srcColorkeyMax);
+                switch (voodoo->banshee_blt.command & COMMAND_CMD_MASK)
+                {
+                        case COMMAND_CMD_SCREEN_TO_SCREEN_BLT:
+                        voodoo->banshee_blt.srcXY = val;
+                        voodoo->banshee_blt.srcX = ((int32_t)(val << 19)) >> 19;
+                        voodoo->banshee_blt.srcY = ((int32_t)(val << 3)) >> 19;
+                        banshee_do_screen_to_screen_blt(voodoo);
+                        break;
+
+                        case COMMAND_CMD_HOST_TO_SCREEN_BLT:
+                        banshee_do_2d_blit(voodoo, 32, val);
+                        break;
+                        
+                        case COMMAND_CMD_HOST_TO_SCREEN_STRETCH_BLT:
+                        banshee_do_2d_blit(voodoo, 32, val);
+                        break;
+
+                        case COMMAND_CMD_RECTFILL:
+                        voodoo->banshee_blt.dstXY = val;
+                        voodoo->banshee_blt.dstX = ((int32_t)(val << 19)) >> 19;
+                        voodoo->banshee_blt.dstY = ((int32_t)(val << 3)) >> 19;
+                        banshee_do_rectfill(voodoo);
+                        break;
+
+                        case COMMAND_CMD_LINE:
+                        voodoo->banshee_blt.dstXY = val;
+                        voodoo->banshee_blt.dstX = ((int32_t)(val << 19)) >> 19;
+                        voodoo->banshee_blt.dstY = ((int32_t)(val << 3)) >> 19;
+                        banshee_do_line(voodoo, 1);
+                        break;
+
+                        case COMMAND_CMD_POLYLINE:
+                        voodoo->banshee_blt.dstXY = val;
+                        voodoo->banshee_blt.dstX = ((int32_t)(val << 19)) >> 19;
+                        voodoo->banshee_blt.dstY = ((int32_t)(val << 3)) >> 19;
+                        banshee_do_line(voodoo, 0);
+                        break;
+                        
+                        case COMMAND_CMD_POLYFILL:
+                        banshee_polyfill_continue(voodoo, val);
+                        break;
+
+                        /*No case exists here for nop or screen-to-screen stretch blit*/
+#ifndef RELEASE_BUILD
+                        default:
+                        fatal("launch area write, command=%08x\n", voodoo->banshee_blt.command);
+#endif
+                }
+                break;
+
+                /*Colour pattern area: 64 dwords. Dword index 0 and 1 are the same
+                  colorPattern[] slots that registers 0x44 and 0x48 write*/
+                case 0x100: case 0x104: case 0x108: case 0x10c:
+                case 0x110: case 0x114: case 0x118: case 0x11c:
+                case 0x120: case 0x124: case 0x128: case 0x12c:
+                case 0x130: case 0x134: case 0x138: case 0x13c:
+                case 0x140: case 0x144: case 0x148: case 0x14c:
+                case 0x150: case 0x154: case 0x158: case 0x15c:
+                case 0x160: case 0x164: case 0x168: case 0x16c:
+                case 0x170: case 0x174: case 0x178: case 0x17c:
+                case 0x180: case 0x184: case 0x188: case 0x18c:
+                case 0x190: case 0x194: case 0x198: case 0x19c:
+                case 0x1a0: case 0x1a4: case 0x1a8: case 0x1ac:
+                case 0x1b0: case 0x1b4: case 0x1b8: case 0x1bc:
+                case 0x1c0: case 0x1c4: case 0x1c8: case 0x1cc:
+                case 0x1d0: case 0x1d4: case 0x1d8: case 0x1dc:
+                case 0x1e0: case 0x1e4: case 0x1e8: case 0x1ec:
+                case 0x1f0: case 0x1f4: case 0x1f8: case 0x1fc:
+                voodoo->banshee_blt.colorPattern[(addr >> 2) & 63] = val;
+                /*24 bpp view: the first 48 dwords hold 64 packed 3-byte pixels.
+                  base_addr indexes the 3-dword group containing this write, and
+                  all four pixels of that group are re-decoded from byte offsets
+                  0, 3, 6 and 9.
+                  NOTE(review): the uint32_t loads at offsets 3, 6 and 9 are
+                  unaligned and the byte layout assumes a little-endian host -
+                  confirm for any new target*/
+                if ((addr & 0x1fc) < 0x1c0)
+                {
+                        int base_addr = (addr & 0xfc) / 0xc;
+                        uintptr_t src_p = (uintptr_t)&voodoo->banshee_blt.colorPattern[base_addr * 3];
+                        int col24 = base_addr * 4;
+
+                        voodoo->banshee_blt.colorPattern24[col24]     = *(uint32_t *)src_p & 0xffffff;
+                        voodoo->banshee_blt.colorPattern24[col24 + 1] = *(uint32_t *)(src_p + 3) & 0xffffff;
+                        voodoo->banshee_blt.colorPattern24[col24 + 2] = *(uint32_t *)(src_p + 6) & 0xffffff;
+                        voodoo->banshee_blt.colorPattern24[col24 + 3] = *(uint32_t *)(src_p + 9) & 0xffffff;
+                }
+                /*16 bpp view: the first 32 dwords, two entries per dword*/
+                if ((addr & 0x1fc) < 0x180)
+                {
+                        voodoo->banshee_blt.colorPattern16[(addr >> 1) & 62]       = val & 0xffff;
+                        voodoo->banshee_blt.colorPattern16[((addr >> 1) & 62) + 1] = (val >> 16) & 0xffff;
+                }
+                /*8 bpp view: the first 16 dwords, four entries per dword*/
+                if ((addr & 0x1fc) < 0x140)
+                {
+                        voodoo->banshee_blt.colorPattern8[addr & 60]       = val & 0xff;
+                        voodoo->banshee_blt.colorPattern8[(addr & 60) + 1] = (val >> 8) & 0xff;
+                        voodoo->banshee_blt.colorPattern8[(addr & 60) + 2] = (val >> 16) & 0xff;
+                        voodoo->banshee_blt.colorPattern8[(addr & 60) + 3] = (val >> 24) & 0xff;
+                }
+//                pclog("colorPattern%02x=%08x\n", (addr >> 2) & 63, val);
+                break;
+
+                /*Offsets 0x00, 0x04 and 0x74-0x7c have no case above*/
+#ifndef RELEASE_BUILD
+                default:
+                fatal("Unknown 2D reg write %03x %08x\n", addr & 0x1fc, val);
+#endif
+        }
+}
diff --git a/pcem/vid_voodoo_banshee_blitter.h b/pcem/vid_voodoo_banshee_blitter.h
new file mode 100644 (file)
index 0000000..cc7a8f2
--- /dev/null
@@ -0,0 +1 @@
+void voodoo_2d_reg_writel(voodoo_t *voodoo, uint32_t addr, uint32_t val);
diff --git a/pcem/vid_voodoo_blitter.cpp b/pcem/vid_voodoo_blitter.cpp
new file mode 100644 (file)
index 0000000..01048f9
--- /dev/null
@@ -0,0 +1,507 @@
+#include <math.h>
+#include <stddef.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_blitter.h"
+#include "vid_voodoo_dither.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+
+/*Blit operations, selected by (bltCommand & BLIT_COMMAND_MASK)*/
+enum
+{
+        BLIT_COMMAND_SCREEN_TO_SCREEN = 0,
+        BLIT_COMMAND_CPU_TO_SCREEN,
+        BLIT_COMMAND_RECT_FILL,
+        BLIT_COMMAND_SGRAM_FILL
+};
+
+/*Host data formats: values of the 3-bit field at bits 3-5 of bltCommand
+  (mask BLIT_SRC_FORMAT)*/
+enum
+{
+        BLIT_SRC_1BPP             = 0x00,
+        BLIT_SRC_1BPP_BYTE_PACKED = 0x08,
+        BLIT_SRC_16BPP            = 0x10,
+        BLIT_SRC_24BPP            = 0x18,
+        BLIT_SRC_24BPP_DITHER_2X2 = 0x20,
+        BLIT_SRC_24BPP_DITHER_4X4 = 0x28
+};
+
+/*Host data channel orders: values of the 2-bit field at bits 6-7 of
+  bltCommand (mask BLIT_SRC_RGB_FORMAT)*/
+enum
+{
+        BLIT_SRC_RGB_ARGB = 0x00,
+        BLIT_SRC_RGB_ABGR = 0x40,
+        BLIT_SRC_RGB_RGBA = 0x80,
+        BLIT_SRC_RGB_BGRA = 0xc0
+};
+
+/*Field masks and single-bit flags applied to bltCommand*/
+enum
+{
+        BLIT_COMMAND_MASK     = 0x00007, /*bits 0-2*/
+        BLIT_SRC_FORMAT       = 0x00038, /*bits 3-5*/
+        BLIT_SRC_RGB_FORMAT   = 0x000c0, /*bits 6-7*/
+        BLIT_SRC_CHROMA       = 0x00400, /*bit 10*/
+        BLIT_DST_CHROMA       = 0x01000, /*bit 12*/
+        BLIT_CLIPPING_ENABLED = 0x10000  /*bit 16*/
+};
+
+/*Chroma test results; OR'd together they form the index into bltRop[]*/
+enum
+{
+        BLIT_ROP_DST_PASS = 0x1,
+        BLIT_ROP_SRC_PASS = 0x2
+};
+
+/*Apply a raster operation to dst_dat in place, combining it with src_dat.
+  rop values 0x0-0xf select one of the sixteen two-operand boolean functions
+  (0x0 clears, 0xc copies the source, 0xa keeps the destination, 0xf sets all
+  16 bits); any other value leaves dst_dat untouched.
+  dst_dat must be a modifiable lvalue. src_dat and dst_dat are parenthesised
+  in the expansion so that expression arguments such as (a | b) are combined
+  as a unit. The macro expands to a bare switch statement, so use it as a
+  complete statement inside a block.*/
+#define MIX(src_dat, dst_dat, rop) \
+        switch (rop)                                                        \
+        {                                                                   \
+                case 0x0: (dst_dat) = 0; break;                             \
+                case 0x1: (dst_dat) = ~((src_dat) | (dst_dat)); break;      \
+                case 0x2: (dst_dat) = ~(src_dat) & (dst_dat); break;        \
+                case 0x3: (dst_dat) = ~(src_dat); break;                    \
+                case 0x4: (dst_dat) = (src_dat) & ~(dst_dat); break;        \
+                case 0x5: (dst_dat) = ~(dst_dat); break;                    \
+                case 0x6: (dst_dat) = (src_dat) ^ (dst_dat); break;         \
+                case 0x7: (dst_dat) = ~((src_dat) & (dst_dat)); break;      \
+                case 0x8: (dst_dat) = (src_dat) & (dst_dat); break;         \
+                case 0x9: (dst_dat) = ~((src_dat) ^ (dst_dat)); break;      \
+                case 0xa: /*destination unchanged*/ break;                  \
+                case 0xb: (dst_dat) = ~(src_dat) | (dst_dat); break;        \
+                case 0xc: (dst_dat) = (src_dat); break;                     \
+                case 0xd: (dst_dat) = (src_dat) | ~(dst_dat); break;        \
+                case 0xe: (dst_dat) = (src_dat) | (dst_dat); break;         \
+                case 0xf: (dst_dat) = 0xffff; break;                        \
+        }
+
+/*Start the Voodoo 2 blit described by the blt* registers in voodoo.
+  The operation is selected by (bltCommand & BLIT_COMMAND_MASK):
+    screen-to-screen  copies 16-bit pixels through the chroma tests and ROP
+    CPU-to-screen     only latches the destination state into voodoo->blt;
+                      pixels are written later by voodoo_v2_blit_data()
+    rect fill         writes bltColorFg to every unclipped pixel
+    SGRAM fill        writes bltColorFg replicated four times as 64-bit words
+  bltSizeX/bltSizeY give direction by sign and size by magnitude; the loops
+  run from 0 to size inclusive. Any other command value ends in fatal().
+  The render thread is idled before any framebuffer access.*/
+void voodoo_v2_blit_start(voodoo_t *voodoo)
+{
+        uint64_t dat64;
+        int size_x = ABS(voodoo->bltSizeX), size_y = ABS(voodoo->bltSizeY);
+        int x_dir = (voodoo->bltSizeX > 0) ? 1 : -1;
+        int y_dir = (voodoo->bltSizeY > 0) ? 1 : -1;
+        int dst_x;
+        int src_y = voodoo->bltSrcY & 0x7ff, dst_y = voodoo->bltDstY & 0x7ff;
+        /*Tiled surfaces take stride and base address from different register
+          fields than linear ones*/
+        int src_stride = (voodoo->bltCommand & BLTCMD_SRC_TILED) ? ((voodoo->bltSrcXYStride & 0x3f) * 32*2) : (voodoo->bltSrcXYStride & 0xff8);
+        int dst_stride = (voodoo->bltCommand & BLTCMD_DST_TILED) ? ((voodoo->bltDstXYStride & 0x3f) * 32*2) : (voodoo->bltDstXYStride & 0xff8);
+        uint32_t src_base_addr = (voodoo->bltCommand & BLTCMD_SRC_TILED) ? ((voodoo->bltSrcBaseAddr & 0x3ff) << 12) : (voodoo->bltSrcBaseAddr & 0x3ffff8);
+        uint32_t dst_base_addr = (voodoo->bltCommand & BLTCMD_DST_TILED) ? ((voodoo->bltDstBaseAddr & 0x3ff) << 12) : (voodoo->bltDstBaseAddr & 0x3ffff8);
+        int x, y;
+
+/*        pclog("blit_start: command=%08x srcX=%i srcY=%i dstX=%i dstY=%i sizeX=%i sizeY=%i color=%04x,%04x\n",
+                voodoo->bltCommand, voodoo->bltSrcX, voodoo->bltSrcY, voodoo->bltDstX, voodoo->bltDstY, voodoo->bltSizeX, voodoo->bltSizeY, voodoo->bltColorFg, voodoo->bltColorBg);*/
+
+        voodoo_wait_for_render_thread_idle(voodoo);
+
+        switch (voodoo->bltCommand & BLIT_COMMAND_MASK)
+        {
+                case BLIT_COMMAND_SCREEN_TO_SCREEN:
+                for (y = 0; y <= size_y; y++)
+                {
+                        uint16_t *src = (uint16_t *)&voodoo->fb_mem[src_base_addr + src_y*src_stride];
+                        uint16_t *dst = (uint16_t *)&voodoo->fb_mem[dst_base_addr + dst_y*dst_stride];
+                        int src_x = voodoo->bltSrcX, dst_x = voodoo->bltDstX;
+
+                        for (x = 0; x <= size_x; x++)
+                        {
+                                uint16_t src_dat = src[src_x];
+                                uint16_t dst_dat = dst[dst_x];
+                                int rop = 0;
+
+                                if (voodoo->bltCommand & BLIT_CLIPPING_ENABLED)
+                                {
+                                        if (dst_x < voodoo->bltClipLeft || dst_x >= voodoo->bltClipRight ||
+                                            dst_y < voodoo->bltClipLowY || dst_y >= voodoo->bltClipHighY)
+                                                goto skip_pixel_blit;
+                                }
+
+                                /*The chroma tests split each pixel as 5:6:5 and select
+                                  which of the four bltRop[] entries is applied*/
+                                if (voodoo->bltCommand & BLIT_SRC_CHROMA)
+                                {
+                                        int r = (src_dat >> 11);
+                                        int g = (src_dat >> 5) & 0x3f;
+                                        int b = src_dat & 0x1f;
+
+                                        if (r >= voodoo->bltSrcChromaMinR && r <= voodoo->bltSrcChromaMaxR &&
+                                            g >= voodoo->bltSrcChromaMinG && g <= voodoo->bltSrcChromaMaxG &&
+                                            b >= voodoo->bltSrcChromaMinB && b <= voodoo->bltSrcChromaMaxB)
+                                                rop |= BLIT_ROP_SRC_PASS;
+                                }
+                                if (voodoo->bltCommand & BLIT_DST_CHROMA)
+                                {
+                                        int r = (dst_dat >> 11);
+                                        int g = (dst_dat >> 5) & 0x3f;
+                                        int b = dst_dat & 0x1f;
+
+                                        if (r >= voodoo->bltDstChromaMinR && r <= voodoo->bltDstChromaMaxR &&
+                                            g >= voodoo->bltDstChromaMinG && g <= voodoo->bltDstChromaMaxG &&
+                                            b >= voodoo->bltDstChromaMinB && b <= voodoo->bltDstChromaMaxB)
+                                                rop |= BLIT_ROP_DST_PASS;
+                                }
+
+                                MIX(src_dat, dst_dat, voodoo->bltRop[rop]);
+
+                                dst[dst_x] = dst_dat;
+skip_pixel_blit:
+                                src_x += x_dir;
+                                dst_x += x_dir;
+                        }
+
+                        src_y += y_dir;
+                        dst_y += y_dir;
+                }
+                break;
+
+                /*No pixels are written here; this state is consumed as host data arrives*/
+                case BLIT_COMMAND_CPU_TO_SCREEN:
+                voodoo->blt.dst_x = voodoo->bltDstX;
+                voodoo->blt.dst_y = voodoo->bltDstY;
+                voodoo->blt.cur_x = 0;
+                voodoo->blt.size_x = size_x;
+                voodoo->blt.size_y = size_y;
+                voodoo->blt.x_dir = x_dir;
+                voodoo->blt.y_dir = y_dir;
+                voodoo->blt.dst_stride = (voodoo->bltCommand & BLTCMD_DST_TILED) ? ((voodoo->bltDstXYStride & 0x3f) * 32*2) : (voodoo->bltDstXYStride & 0xff8);
+                break;
+
+                case BLIT_COMMAND_RECT_FILL:
+                for (y = 0; y <= size_y; y++)
+                {
+                        uint16_t *dst;
+                        int dst_x = voodoo->bltDstX;
+
+                        if (SLI_ENABLED)
+                        {
+                                /*With SLI only lines of one parity are filled (which one
+                                  depends on INITENABLE_SLI_MASTER_SLAVE) and the
+                                  framebuffer row is dst_y >> 1. The parity test must use
+                                  the line being filled; voodoo->blt.dst_y is CPU-to-screen
+                                  state that this loop never updates*/
+                                if ((!(voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && (dst_y & 1)) ||
+                                    ((voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && !(dst_y & 1)))
+                                        goto skip_line_fill;
+                                dst = (uint16_t *)&voodoo->fb_mem[dst_base_addr + (dst_y >> 1) * dst_stride];
+                        }
+                        else
+                                dst = (uint16_t *)&voodoo->fb_mem[dst_base_addr + dst_y*dst_stride];
+
+                        for (x = 0; x <= size_x; x++)
+                        {
+                                if (voodoo->bltCommand & BLIT_CLIPPING_ENABLED)
+                                {
+                                        if (dst_x < voodoo->bltClipLeft || dst_x >= voodoo->bltClipRight ||
+                                            dst_y < voodoo->bltClipLowY || dst_y >= voodoo->bltClipHighY)
+                                                goto skip_pixel_fill;
+                                }
+
+                                dst[dst_x] = voodoo->bltColorFg;
+skip_pixel_fill:
+                                dst_x += x_dir;
+                        }
+skip_line_fill:
+                        dst_y += y_dir;
+                }
+                break;
+
+                case BLIT_COMMAND_SGRAM_FILL:
+                /*32x32 tiles - 2kb*/
+                dst_y = voodoo->bltDstY & 0x3ff;
+                size_x = voodoo->bltSizeX & 0x1ff; //512*8 = 4kb
+                size_y = voodoo->bltSizeY & 0x3ff;
+
+                /*Replicate the 16-bit fill colour across a 64-bit word*/
+                dat64 = voodoo->bltColorFg | ((uint64_t)voodoo->bltColorFg << 16) |
+                        ((uint64_t)voodoo->bltColorFg << 32) | ((uint64_t)voodoo->bltColorFg << 48);
+
+                for (y = 0; y <= size_y; y++)
+                {
+                        uint64_t *dst;
+
+                        /*This may be wrong*/
+                        /*First row runs from bltDstX to the end of the row, middle
+                          rows are filled completely, last row stops at bltSizeX*/
+                        if (!y)
+                        {
+                                dst_x = voodoo->bltDstX & 0x1ff;
+                                size_x = 511 - dst_x;
+                        }
+                        else if (y < size_y)
+                        {
+                                dst_x = 0;
+                                size_x = 511;
+                        }
+                        else
+                        {
+                                dst_x = 0;
+                                size_x = voodoo->bltSizeX & 0x1ff;
+                        }
+
+                        /*Only the row start address is wrapped with fb_mask*/
+                        dst = (uint64_t *)&voodoo->fb_mem[(dst_y*512*8 + dst_x*8) & voodoo->fb_mask];
+
+                        for (x = 0; x <= size_x; x++)
+                                dst[x] = dat64;
+
+                        dst_y++;
+                }
+                break;
+
+                default:
+                fatal("bad blit command %08x\n", voodoo->bltCommand);
+        }
+}
+/*Consume one 32-bit word of host data for a CPU-to-screen blit: unpack its pixels, apply SLI/clip/chroma/ROP, write them to the current row and advance the blit cursor*/
+void voodoo_v2_blit_data(voodoo_t *voodoo, uint32_t data)
+{
+        int src_bits = 32;
+        uint32_t base_addr = (voodoo->bltCommand & BLTCMD_DST_TILED) ? ((voodoo->bltDstBaseAddr & 0x3ff) << 12) : (voodoo->bltDstBaseAddr & 0x3ffff8);
+        uint32_t addr;
+        uint16_t *dst;
+        /*Data words written while any other blit command is selected are dropped*/
+        if ((voodoo->bltCommand & BLIT_COMMAND_MASK) != BLIT_COMMAND_CPU_TO_SCREEN)
+                return;
+        /*Row offset is wrapped with fb_mask, as the fill paths do, so a guest-programmed base/stride cannot index past fb_mem. Under SLI the row index is dst_y >> 1*/
+        if (SLI_ENABLED)
+        {
+                addr = (base_addr + (voodoo->blt.dst_y >> 1) * voodoo->blt.dst_stride) & voodoo->fb_mask;
+                dst = (uint16_t *)&voodoo->fb_mem[addr];
+        }
+        else
+        {
+                addr = (base_addr + voodoo->blt.dst_y*voodoo->blt.dst_stride) & voodoo->fb_mask;
+                dst = (uint16_t *)&voodoo->fb_mem[addr];
+        }
+        /*Mark the display line containing this row in dirty_line when the row lies at or after front_offset*/
+        if (addr >= voodoo->front_offset && voodoo->row_width)
+        {
+                int y = (addr - voodoo->front_offset) / voodoo->row_width;
+                if (y < voodoo->v_disp)
+                        voodoo->dirty_line[y] = 2;
+        }
+        /*Unpack pixels until the word is used up or the current row (cur_x 0..size_x inclusive) is complete*/
+        while (src_bits && voodoo->blt.cur_x <= voodoo->blt.size_x)
+        {
+                int r = 0, g = 0, b = 0;
+                uint16_t src_dat = 0, dst_dat;
+                int x = (voodoo->blt.x_dir > 0) ? (voodoo->blt.dst_x + voodoo->blt.cur_x) : (voodoo->blt.dst_x - voodoo->blt.cur_x);
+                int rop = 0;
+
+                switch (voodoo->bltCommand & BLIT_SRC_FORMAT)
+                {
+                        case BLIT_SRC_1BPP: case BLIT_SRC_1BPP_BYTE_PACKED:
+                        src_dat = (data & 1) ? voodoo->bltColorFg : voodoo->bltColorBg; /*LSB first: set bit = foreground, clear bit = background*/
+                        data >>= 1;
+                        src_bits--;
+                        break;
+                        case BLIT_SRC_16BPP:
+                        switch (voodoo->bltCommand & BLIT_SRC_RGB_FORMAT)
+                        {
+                                case BLIT_SRC_RGB_ARGB: case BLIT_SRC_RGB_RGBA:
+                                src_dat = data & 0xffff;
+                                break;
+                                case BLIT_SRC_RGB_ABGR: case BLIT_SRC_RGB_BGRA:
+                                src_dat = ((data & 0xf800) >> 11) | (data & 0x07e0) | ((data & 0x001f) << 11); /*5:6:5 - exchange the two 5-bit end fields, keep the 6-bit middle field*/
+                                break;
+                        }
+                        data >>= 16;
+                        src_bits -= 16;
+                        break;
+                        case BLIT_SRC_24BPP: case BLIT_SRC_24BPP_DITHER_2X2: case BLIT_SRC_24BPP_DITHER_4X4:
+                        switch (voodoo->bltCommand & BLIT_SRC_RGB_FORMAT)
+                        {
+                                case BLIT_SRC_RGB_ARGB:
+                                r = (data >> 16) & 0xff;
+                                g = (data >> 8) & 0xff;
+                                b = data & 0xff;
+                                break;
+                                case BLIT_SRC_RGB_ABGR:
+                                r = data & 0xff;
+                                g = (data >> 8) & 0xff;
+                                b = (data >> 16) & 0xff;
+                                break;
+                                case BLIT_SRC_RGB_RGBA:
+                                r = (data >> 24) & 0xff;
+                                g = (data >> 16) & 0xff;
+                                b = (data >> 8) & 0xff;
+                                break;
+                                case BLIT_SRC_RGB_BGRA:
+                                r = (data >> 8) & 0xff;
+                                g = (data >> 16) & 0xff;
+                                b = (data >> 24) & 0xff;
+                                break;
+                        }
+                        switch (voodoo->bltCommand & BLIT_SRC_FORMAT)
+                        {
+                                case BLIT_SRC_24BPP:
+                                src_dat = (b >> 3) | ((g & 0xfc) << 3) | ((r & 0xf8) << 8);
+                                break;
+                                case BLIT_SRC_24BPP_DITHER_2X2:
+                                r = dither_rb2x2[r][voodoo->blt.dst_y & 1][x & 1];
+                                g =  dither_g2x2[g][voodoo->blt.dst_y & 1][x & 1];
+                                b = dither_rb2x2[b][voodoo->blt.dst_y & 1][x & 1];
+                                src_dat = (b >> 3) | ((g & 0xfc) << 3) | ((r & 0xf8) << 8);
+                                break;
+                                case BLIT_SRC_24BPP_DITHER_4X4:
+                                r = dither_rb[r][voodoo->blt.dst_y & 3][x & 3];
+                                g =  dither_g[g][voodoo->blt.dst_y & 3][x & 3];
+                                b = dither_rb[b][voodoo->blt.dst_y & 3][x & 3];
+                                src_dat = (b >> 3) | ((g & 0xfc) << 3) | ((r & 0xf8) << 8);
+                                break;
+                        }
+                        src_bits = 0; /*one pixel per 32-bit word in the 24bpp formats*/
+                        break;
+                }
+                /*SLI: odd lines are skipped when INITENABLE_SLI_MASTER_SLAVE is clear, even lines when it is set; the pixel is still consumed*/
+                if (SLI_ENABLED)
+                {
+                        if ((!(voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && (voodoo->blt.dst_y & 1)) ||
+                            ((voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && !(voodoo->blt.dst_y & 1)))
+                                goto skip_pixel;
+                }
+                /*Clip rectangle: left/low edges inclusive, right/high edges exclusive*/
+                if (voodoo->bltCommand & BLIT_CLIPPING_ENABLED)
+                {
+                        if (x < voodoo->bltClipLeft || x >= voodoo->bltClipRight ||
+                            voodoo->blt.dst_y < voodoo->bltClipLowY || voodoo->blt.dst_y >= voodoo->bltClipHighY)
+                                goto skip_pixel;
+                }
+                /*Destination pixel is needed by the destination chroma test and by the ROP*/
+                dst_dat = dst[x];
+                /*Each chroma test that finds its colour inside the min..max range sets one bit of the bltRop[] index*/
+                if (voodoo->bltCommand & BLIT_SRC_CHROMA)
+                {
+                        r = (src_dat >> 11);
+                        g = (src_dat >> 5) & 0x3f;
+                        b = src_dat & 0x1f;
+
+                        if (r >= voodoo->bltSrcChromaMinR && r <= voodoo->bltSrcChromaMaxR &&
+                            g >= voodoo->bltSrcChromaMinG && g <= voodoo->bltSrcChromaMaxG &&
+                            b >= voodoo->bltSrcChromaMinB && b <= voodoo->bltSrcChromaMaxB)
+                                rop |= BLIT_ROP_SRC_PASS;
+                }
+                if (voodoo->bltCommand & BLIT_DST_CHROMA)
+                {
+                        r = (dst_dat >> 11);
+                        g = (dst_dat >> 5) & 0x3f;
+                        b = dst_dat & 0x1f;
+
+                        if (r >= voodoo->bltDstChromaMinR && r <= voodoo->bltDstChromaMaxR &&
+                            g >= voodoo->bltDstChromaMinG && g <= voodoo->bltDstChromaMaxG &&
+                            b >= voodoo->bltDstChromaMinB && b <= voodoo->bltDstChromaMaxB)
+                                rop |= BLIT_ROP_DST_PASS;
+                }
+                /*MIX is defined outside this function; presumably it leaves the ROP result in dst_dat, which is stored back below*/
+                MIX(src_dat, dst_dat, voodoo->bltRop[rop]);
+
+                dst[x] = dst_dat;
+
+skip_pixel:
+                voodoo->blt.cur_x++;
+        }
+        /*Row finished: count it off and, while rows remain, rewind cur_x and step dst_y by y_dir*/
+        if (voodoo->blt.cur_x > voodoo->blt.size_x)
+        {
+                voodoo->blt.size_y--;
+                if (voodoo->blt.size_y >= 0)
+                {
+                        voodoo->blt.cur_x = 0;
+                        voodoo->blt.dst_y += voodoo->blt.y_dir;
+                }
+        }
+}
+/*Fast fill of the clip rectangle (x in clipLeft..clipRight-1, y in low_y..high_y-1) using 16-bit pixels.*/
+/*FBZ_RGB_WMASK writes color1 reduced to 5:6:5 into the colour buffer; FBZ_DEPTH_WMASK writes the low 16 bits of zaColor into the aux buffer.*/
+void voodoo_fastfill(voodoo_t *voodoo, voodoo_params_t *params)
+{
+        int y;
+        int low_y, high_y;
+        /*When fbzMode bit 17 is set the clip Y range is mirrored about v_disp*/
+        if (params->fbzMode & (1 << 17))
+        {
+                high_y = voodoo->v_disp - params->clipLowY;
+                low_y = voodoo->v_disp - params->clipHighY;
+        }
+        else
+        {
+                low_y = params->clipLowY;
+                high_y = params->clipHighY;
+        }
+        /*Colour buffer fill*/
+        if (params->fbzMode & FBZ_RGB_WMASK)
+        {
+                int r, g, b;
+                uint16_t col;
+                /*Keep the top 5/6/5 bits of the 8-bit R/G/B fields of color1*/
+                r = ((params->color1 >> 16) >> 3) & 0x1f;
+                g = ((params->color1 >> 8) >> 2) & 0x3f;
+                b = (params->color1 >> 3) & 0x1f;
+                col = b | (g << 5) | (r << 11);
+                /*SLI: every second line starting at low_y is visited, and line y is stored at row y >> 1*/
+                if (SLI_ENABLED)
+                {
+                        for (y = low_y; y < high_y; y += 2)
+                        {
+                                uint16_t *cbuf = (uint16_t *)&voodoo->fb_mem[(params->draw_offset + (y >> 1) * voodoo->row_width) & voodoo->fb_mask];
+                                int x;
+
+                                for (x = params->clipLeft; x < params->clipRight; x++)
+                                        cbuf[x] = col;
+                        }
+                }
+                else
+                {
+                        for (y = low_y; y < high_y; y++)
+                        {
+                                if (voodoo->col_tiled) /*tiled layout: row_width per band of 32 lines, 128 bytes per line inside a tile*/
+                                {
+                                        uint16_t *cbuf = (uint16_t *)&voodoo->fb_mem[(params->draw_offset + (y >> 5) * voodoo->row_width + (y & 31) * 128) & voodoo->fb_mask];
+                                        int x;
+
+                                        for (x = params->clipLeft; x < params->clipRight; x++)
+                                        {
+                                                int x2 = (x & 63) | ((x >> 6) * 128*32/2); /*64 pixels per tile line; the next tile starts 128*32 bytes (in 16-bit units: /2) further on*/
+                                                cbuf[x2] = col;
+                                        }
+                                }
+                                else
+                                {
+                                        uint16_t *cbuf = (uint16_t *)&voodoo->fb_mem[(params->draw_offset + y * voodoo->row_width) & voodoo->fb_mask];
+                                        int x;
+
+                                        for (x = params->clipLeft; x < params->clipRight; x++)
+                                                cbuf[x] = col;
+                                }
+                        }
+                }
+        }
+        if (params->fbzMode & FBZ_DEPTH_WMASK) /*Aux buffer fill, same traversal as the colour fill above*/
+        {
+                if (SLI_ENABLED)
+                {
+                        for (y = low_y; y < high_y; y += 2)
+                        {
+                                uint16_t *abuf = (uint16_t *)&voodoo->fb_mem[(params->aux_offset + (y >> 1) * voodoo->row_width) & voodoo->fb_mask]; /*NOTE(review): row_width here, aux_row_width in the non-SLI aux paths - confirm intended*/
+                                int x;
+
+                                for (x = params->clipLeft; x < params->clipRight; x++)
+                                        abuf[x] = params->zaColor & 0xffff;
+                        }
+                }
+                else
+                {
+                        for (y = low_y; y < high_y; y++)
+                        {
+                                if (voodoo->aux_tiled)
+                                {
+                                        uint16_t *abuf = (uint16_t *)&voodoo->fb_mem[(params->aux_offset + (y >> 5) * voodoo->aux_row_width + (y & 31) * 128) & voodoo->fb_mask];
+                                        int x;
+
+                                        for (x = params->clipLeft; x < params->clipRight; x++)
+                                        {
+                                                int x2 = (x & 63) | ((x >> 6) * 128*32/2);
+                                                abuf[x2] = params->zaColor & 0xffff;
+                                        }
+                                }
+                                else
+                                {
+                                        uint16_t *abuf = (uint16_t *)&voodoo->fb_mem[(params->aux_offset + y * voodoo->aux_row_width) & voodoo->fb_mask];
+                                        int x;
+
+                                        for (x = params->clipLeft; x < params->clipRight; x++)
+                                                abuf[x] = params->zaColor & 0xffff;
+                                }
+                        }
+                }
+        }
+}
diff --git a/pcem/vid_voodoo_blitter.h b/pcem/vid_voodoo_blitter.h
new file mode 100644 (file)
index 0000000..8d315df
--- /dev/null
@@ -0,0 +1,3 @@
+void voodoo_v2_blit_start(voodoo_t *voodoo); /*presumably starts the blit selected by voodoo->bltCommand - TODO confirm against the definition*/
+void voodoo_v2_blit_data(voodoo_t *voodoo, uint32_t data); /*feeds one 32-bit data word to a CPU-to-screen blit; the word is ignored for any other command*/
+void voodoo_fastfill(voodoo_t *voodoo, voodoo_params_t *params); /*fills the clip rectangle of the colour and/or aux buffer according to the fbzMode write masks*/
diff --git a/pcem/vid_voodoo_codegen_x86-64.h b/pcem/vid_voodoo_codegen_x86-64.h
new file mode 100644 (file)
index 0000000..35ed9e5
--- /dev/null
@@ -0,0 +1,3467 @@
+/*Registers :
+        
+  alphaMode
+  fbzMode & 0x1f3fff
+  fbzColorPath
+*/
+
+#if defined(__linux__) || defined(__APPLE__)
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+#if WIN64
+#define BITMAP windows_BITMAP
+#include <windows.h>
+#undef BITMAP
+#endif
+
+#include <xmmintrin.h>
+/*Code block sizing constants and the tLOD mirror flag mask*/
+#define BLOCK_NUM 8
+#define BLOCK_MASK (BLOCK_NUM-1) /*usable as a wrap-around mask because BLOCK_NUM is a power of two*/
+#define BLOCK_SIZE 8192 /*bytes in a code_block array; the add* emitters call fatal() when block_pos reaches it*/
+
+#define LOD_MASK (LOD_TMIRROR_S | LOD_TMIRROR_T) /*both texture mirror flags*/
+/*A BLOCK_SIZE-byte code buffer together with register values; presumably the state the code was generated for, used to match cached blocks - TODO confirm against the lookup code*/
+typedef struct voodoo_x86_data_t
+{
+        uint8_t code_block[BLOCK_SIZE];
+        int xdir;
+        uint32_t alphaMode;
+        uint32_t fbzMode;
+        uint32_t fogMode;
+        uint32_t fbzColorPath;
+        uint32_t textureMode[2]; /*presumably one entry per TMU, as with params->textureMode[tmu] - verify*/
+        uint32_t tLOD[2]; /*presumably one entry per TMU - verify*/
+        uint32_t trexInit1;        
+        int is_tiled;
+} voodoo_x86_data_t;
+/*Four-entry index arrays; only the first two elements have explicit initialisers, the rest are zero-initialised too. Their users are not visible here - names suggest block cache cursors (TODO confirm)*/
+//static voodoo_x86_data_t voodoo_x86_data[2][BLOCK_NUM];
+
+static int last_block[4] = {0, 0};
+static int next_block_to_write[4] = {0, 0};
+/*Emitters that append val (1, 2, 4 or 8 bytes) at code_block[block_pos] and advance block_pos; both names must be in scope at the point of use. The limit is tested before the store so a multi-byte store can never run past BLOCK_SIZE; fatal() fires under the same condition as when the test followed the store*/
+#define addbyte(val)                                            \
+        do {                                                    \
+                if (block_pos + 1 >= BLOCK_SIZE)                \
+                        fatal("Over!\n");                       \
+                code_block[block_pos++] = (val);                \
+        } while (0)
+
+#define addword(val)                                            \
+        do {                                                    \
+                if (block_pos + 2 >= BLOCK_SIZE)                \
+                        fatal("Over!\n");                       \
+                *(uint16_t *)&code_block[block_pos] = (val);    \
+                block_pos += 2;                                 \
+        } while (0)
+
+#define addlong(val)                                            \
+        do {                                                    \
+                if (block_pos + 4 >= BLOCK_SIZE)                \
+                        fatal("Over!\n");                       \
+                *(uint32_t *)&code_block[block_pos] = (val);    \
+                block_pos += 4;                                 \
+        } while (0)
+
+#define addquad(val)                                            \
+        do {                                                    \
+                if (block_pos + 8 >= BLOCK_SIZE)                \
+                        fatal("Over!\n");                       \
+                *(uint64_t *)&code_block[block_pos] = (val);    \
+                block_pos += 8;                                 \
+        } while (0)
+/*SSE constants and lookup tables. None has an initialiser here; the trailing comments record the intended 64-bit pattern, so they are presumably filled in by init code elsewhere (TODO confirm).*/
+/*The emitter comments in codegen_texture_fetch refer to alookup[0] as the zero value reached through R10.*/
+static __m128i xmm_01_w;// = 0x0001000100010001ull;
+static __m128i xmm_ff_w;// = 0x00ff00ff00ff00ffull;
+static __m128i xmm_ff_b;// = 0x00000000ffffffffull;
+
+static __m128i alookup[257], aminuslookup[256];
+static __m128i minus_254;// = 0xff02ff02ff02ff02ull;
+static __m128i bilinear_lookup[256*2];
+static __m128i xmm_00_ff_w[2];
+static uint32_t i_00_ff_w[2] = {0, 0xff};
+
+static inline int codegen_texture_fetch(uint8_t *code_block, voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int block_pos, int tmu)
+{
+        if (params->textureMode[tmu] & 1)
+        {
+                addbyte(0x48); /*MOV RBX, state->tmu0_s*/
+                addbyte(0x8b);
+                addbyte(0x9f);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_s) : offsetof(voodoo_state_t, tmu0_s));
+                addbyte(0x48); /*MOV RAX, (1 << 48)*/
+                addbyte(0xb8);
+                addquad(1ULL << 48);
+                addbyte(0x48); /*XOR RDX, RDX*/
+                addbyte(0x31);
+                addbyte(0xd2);
+                addbyte(0x48); /*MOV RCX, state->tmu0_t*/
+                addbyte(0x8b);
+                addbyte(0x8f);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_t) : offsetof(voodoo_state_t, tmu0_t));
+                addbyte(0x48); /*CMP state->tmu_w, 0*/
+                addbyte(0x83);
+                addbyte(0xbf);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_w) : offsetof(voodoo_state_t, tmu0_w));
+                addbyte(0);
+                addbyte(0x74); /*JZ +*/
+                addbyte(7);
+                addbyte(0x48); /*IDIV state->tmu_w*/
+                addbyte(0xf7);
+                addbyte(0xbf);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_w) : offsetof(voodoo_state_t, tmu0_w));
+                addbyte(0x48); /*SAR RBX, 14*/
+                addbyte(0xc1);
+                addbyte(0xfb);
+                addbyte(14);
+                addbyte(0x48); /*SAR RCX, 14*/
+                addbyte(0xc1);
+                addbyte(0xf9);
+                addbyte(14);
+                addbyte(0x48); /*IMUL RBX, RAX*/
+                addbyte(0x0f);
+                addbyte(0xaf);
+                addbyte(0xd8);
+                addbyte(0x48); /*IMUL RCX, RAX*/
+                addbyte(0x0f);
+                addbyte(0xaf);
+                addbyte(0xc8);
+                addbyte(0x48); /*SAR RBX, 30*/
+                addbyte(0xc1);
+                addbyte(0xfb);
+                addbyte(30);
+                addbyte(0x48); /*SAR RCX, 30*/
+                addbyte(0xc1);
+                addbyte(0xf9);
+                addbyte(30);
+                addbyte(0x48); /*BSR EDX, RAX*/
+                addbyte(0x0f);
+                addbyte(0xbd);
+                addbyte(0xd0);
+                addbyte(0x48); /*SHL RAX, 8*/
+                addbyte(0xc1);
+                addbyte(0xe0);
+                addbyte(8);
+                addbyte(0x89); /*MOV state->tex_t, ECX*/
+                addbyte(0x8f);
+                addlong(offsetof(voodoo_state_t, tex_t));
+                addbyte(0x89); /*MOV ECX, EDX*/
+                addbyte(0xd1);
+                addbyte(0x83); /*SUB EDX, 19*/
+                addbyte(0xea);
+                addbyte(19);
+                addbyte(0x48); /*SHR RAX, CL*/
+                addbyte(0xd3);
+                addbyte(0xe8);
+                addbyte(0xc1); /*SHL EDX, 8*/
+                addbyte(0xe2);
+                addbyte(8);
+                addbyte(0x25); /*AND EAX, 0xff*/
+                addlong(0xff);
+                addbyte(0x89); /*MOV state->tex_s, EBX*/
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, tex_s));
+                addbyte(0x41); /*MOVZX EAX, R9(logtable)[RAX]*/
+                addbyte(0x0f);
+                addbyte(0xb6);
+                addbyte(0x04);
+                addbyte(0x01);
+                addbyte(0x09); /*OR EAX, EDX*/
+                addbyte(0xd0);
+                addbyte(0x03); /*ADD EAX, state->lod*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tmu[tmu].lod));
+                addbyte(0x3b); /*CMP EAX, state->lod_min*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_min[tmu]));
+                addbyte(0x0f); /*CMOVL EAX, state->lod_min*/
+                addbyte(0x4c);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_min[tmu]));
+                addbyte(0x3b); /*CMP EAX, state->lod_max*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_max[tmu]));
+                addbyte(0x0f); /*CMOVNL EAX, state->lod_max*/
+                addbyte(0x4d);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_max[tmu]));
+                addbyte(0xc1); /*SHR EAX, 8*/
+                addbyte(0xe8);
+                addbyte(8);        
+                addbyte(0x89); /*MOV state->lod, EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod));
+        }
+        else
+        {
+                addbyte(0x48); /*MOV RAX, state->tmu0_s*/
+                addbyte(0x8b);
+                addbyte(0x87);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_s) : offsetof(voodoo_state_t, tmu0_s));
+                addbyte(0x48); /*MOV RCX, state->tmu0_t*/
+                addbyte(0x8b);
+                addbyte(0x8f);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_t) : offsetof(voodoo_state_t, tmu0_t));
+                addbyte(0x48); /*SHR RAX, 28*/
+                addbyte(0xc1);
+                addbyte(0xe8);
+                addbyte(28);
+                addbyte(0x8b); /*MOV EBX, state->lod_min*/
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, lod_min[tmu]));
+                addbyte(0x48); /*SHR RCX, 28*/
+                addbyte(0xc1);
+                addbyte(0xe9);
+                addbyte(28);
+                addbyte(0x48); /*MOV state->tex_s, RAX*/
+                addbyte(0x89);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_s));
+                addbyte(0xc1); /*SHR EBX, 8*/
+                addbyte(0xeb);
+                addbyte(8);        
+                addbyte(0x48); /*MOV state->tex_t, RCX*/
+                addbyte(0x89);
+                addbyte(0x8f);
+                addlong(offsetof(voodoo_state_t, tex_t));
+                addbyte(0x89); /*MOV state->lod, EBX*/
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, lod));
+        }
+
+        if (params->fbzColorPath & FBZCP_TEXTURE_ENABLED)
+        {
+                if (voodoo->bilinear_enabled && (params->textureMode[tmu] & 6))
+                {
+                        addbyte(0xb2); /*MOV DL, 8*/
+                        addbyte(8);
+                        addbyte(0x8b); /*MOV ECX, state->lod[RDI]*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xbd); /*MOV EBP, 1*/
+                        addlong(1);
+                        addbyte(0x28); /*SUB DL, CL*/
+                        addbyte(0xca);
+//                        addbyte(0x8a); /*MOV DL, params->tex_shift[RSI+ECX*4]*/
+//                        addbyte(0x94);
+//                        addbyte(0x8e);
+//                        addlong(offsetof(voodoo_params_t, tex_shift));
+                        addbyte(0xd3); /*SHL EBP, CL*/
+                        addbyte(0xe5);
+                        addbyte(0x8b); /*MOV EAX, state->tex_s[RDI]*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_s));
+                        addbyte(0xc1); /*SHL EBP, 3*/
+                        addbyte(0xe5);
+                        addbyte(3);
+                        addbyte(0x8b); /*MOV EBX, state->tex_t[RDI]*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_t));
+                        if (params->tLOD[tmu] & LOD_TMIRROR_S)
+                        {
+                                addbyte(0xa9); /*TEST EAX, 0x1000*/
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EAX*/
+                                addbyte(0xd0);
+                        }
+                        if (params->tLOD[tmu] & LOD_TMIRROR_T)
+                        {
+                                addbyte(0xf7); /*TEST EBX, 0x1000*/
+                                addbyte(0xc3);
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EBX*/
+                                addbyte(0xd3);
+                        }
+                        addbyte(0x29); /*SUB EAX, EBP*/
+                        addbyte(0xe8);
+                        addbyte(0x29); /*SUB EBX, EBP*/
+                        addbyte(0xeb);
+                        addbyte(0xd3); /*SAR EAX, CL*/
+                        addbyte(0xf8);
+                        addbyte(0xd3); /*SAR EBX, CL*/
+                        addbyte(0xfb);
+                        addbyte(0x89); /*MOV EBP, EAX*/
+                        addbyte(0xc5);
+                        addbyte(0x89); /*MOV ECX, EBX*/
+                        addbyte(0xd9);
+                        addbyte(0x83); /*AND EBP, 0xf*/
+                        addbyte(0xe5);
+                        addbyte(0xf);
+                        addbyte(0xc1); /*SHL ECX, 4*/
+                        addbyte(0xe1);
+                        addbyte(4);
+                        addbyte(0xc1); /*SAR EAX, 4*/
+                        addbyte(0xf8);
+                        addbyte(4);
+                        addbyte(0x81); /*AND ECX, 0xf0*/
+                        addbyte(0xe1);
+                        addlong(0xf0);
+                        addbyte(0xc1); /*SAR EBX, 4*/
+                        addbyte(0xfb);
+                        addbyte(4);
+                        addbyte(0x09); /*OR EBP, ECX*/
+                        addbyte(0xcd);
+                        addbyte(0x8b); /*MOV ECX, state->lod[RDI]*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xc1); /*SHL EBP, 5*/
+                        addbyte(0xe5);
+                        addbyte(5);
+                        /*EAX = S, EBX = T, ECX = LOD, EDX = tex_shift, ESI=params, EDI=state, EBP = bilinear shift*/
+                        addbyte(0x48); /*LEA RSI, [RSI+RCX*4]*/
+                        addbyte(0x8d);
+                        addbyte(0x34);
+                        addbyte(0x8e);
+                        addbyte(0x89); /*MOV ebp_store, EBP*/
+                        addbyte(0xaf);
+                        addlong(offsetof(voodoo_state_t, ebp_store));
+                        addbyte(0x48); /*MOV RBP, state->tex[RDI+RCX*8]*/
+                        addbyte(0x8b);
+                        addbyte(0xac);
+                        addbyte(0xcf);
+                        addlong(offsetof(voodoo_state_t, tex[tmu]));
+                        addbyte(0x88); /*MOV CL, DL*/
+                        addbyte(0xd1);
+                        addbyte(0x89); /*MOV EDX, EBX*/
+                        addbyte(0xda);
+                        if (!state->clamp_s[tmu])
+                        {
+                                addbyte(0x23); /*AND EAX, params->tex_w_mask[ESI]*/
+                                addbyte(0x86);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]));
+                        }
+                        addbyte(0x83); /*ADD EDX, 1*/
+                        addbyte(0xc2);
+                        addbyte(1);
+                        if (state->clamp_t[tmu])
+                        {
+                                addbyte(0x41); /*CMOVS EDX, R10(alookup[0](zero))*/
+                                addbyte(0x0f);
+                                addbyte(0x48);
+                                addbyte(0x12);
+                                addbyte(0x3b); /*CMP EDX, params->tex_h_mask[ESI]*/
+                                addbyte(0x96);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x0f); /*CMOVA EDX, params->tex_h_mask[ESI]*/
+                                addbyte(0x47);
+                                addbyte(0x96);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x85); /*TEST EBX,EBX*/
+                                addbyte(0xdb);
+                                addbyte(0x41); /*CMOVS EBX, R10(alookup[0](zero))*/
+                                addbyte(0x0f);
+                                addbyte(0x48);
+                                addbyte(0x1a);
+                                addbyte(0x3b); /*CMP EBX, params->tex_h_mask[ESI]*/
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x0f); /*CMOVA EBX, params->tex_h_mask[ESI]*/
+                                addbyte(0x47);
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                        }
+                        else
+                        {
+                                addbyte(0x23); /*AND EDX, params->tex_h_mask[ESI]*/
+                                addbyte(0x96);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x23); /*AND EBX, params->tex_h_mask[ESI]*/
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                        }
+                        /*EAX = S, EBX = T0, EDX = T1*/
+                        addbyte(0xd3); /*SHL EBX, CL*/
+                        addbyte(0xe3);
+                        addbyte(0xd3); /*SHL EDX, CL*/
+                        addbyte(0xe2);
+                        addbyte(0x48); /*LEA RBX,[RBP+RBX*4]*/
+                        addbyte(0x8d);
+                        addbyte(0x5c);
+                        addbyte(0x9d);
+                        addbyte(0);
+                        addbyte(0x48); /*LEA RDX,[RBP+RDX*4]*/
+                        addbyte(0x8d);
+                        addbyte(0x54);
+                        addbyte(0x95);
+                        addbyte(0);
+                        if (state->clamp_s[tmu])
+                        {
+                                addbyte(0x8b); /*MOV EBP, params->tex_w_mask[ESI]*/
+                                addbyte(0xae);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]));
+                                addbyte(0x85); /*TEST EAX, EAX*/
+                                addbyte(0xc0);
+                                addbyte(0x8b); /*MOV ebp_store2, RSI*/
+                                addbyte(0xb7);
+                                addlong(offsetof(voodoo_state_t, ebp_store));
+                                addbyte(0x41); /*CMOVS EAX, R10(alookup[0](zero))*/
+                                addbyte(0x0f);
+                                addbyte(0x48);
+                                addbyte(0x02);
+                                addbyte(0x78); /*JS + - clamp on 0*/
+                                addbyte(2+3+2+ 5+5+2);
+                                addbyte(0x3b); /*CMP EAX, EBP*/
+                                addbyte(0xc5);
+                                addbyte(0x0f); /*CMOVAE EAX, EBP*/
+                                addbyte(0x43);
+                                addbyte(0xc5);
+                                addbyte(0x73); /*JAE + - clamp on +*/
+                                addbyte(5+5+2);
+                        }
+                        else
+                        {
+                                addbyte(0x3b); /*CMP EAX, params->tex_w_mask[ESI] - is S at texture edge (ie will wrap/clamp)?*/
+                                addbyte(0x86);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]));
+                                addbyte(0x8b); /*MOV ebp_store2, ESI*/
+                                addbyte(0xb7);
+                                addlong(offsetof(voodoo_state_t, ebp_store));
+                                addbyte(0x74); /*JE +*/
+                                addbyte(5+5+2);
+                        }
+
+                        addbyte(0xf3); /*MOVQ XMM0, [RBX+RAX*4]*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0x04);
+                        addbyte(0x83);
+                        addbyte(0xf3); /*MOVQ XMM1, [RDX+RAX*4]*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0x0c);
+                        addbyte(0x82);
+
+                        if (state->clamp_s[tmu])
+                        {
+                                addbyte(0xeb); /*JMP +*/
+                                addbyte(5+5+4+4);
+
+                                /*S clamped - the two S coordinates are the same*/
+                                addbyte(0x66); /*MOVD XMM0, [RBX+RAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x04);
+                                addbyte(0x83);
+                                addbyte(0x66); /*MOVD XMM1, [RDX+RAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x0c);
+                                addbyte(0x82);
+                                addbyte(0x66); /*PUNPCKLDQ XMM0, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0x62);
+                                addbyte(0xc0);
+                                addbyte(0x66); /*PUNPCKLDQ XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x62);
+                                addbyte(0xc9);
+                        }
+                        else
+                        {
+                                addbyte(0xeb); /*JMP +*/
+                                addbyte(5+5+5+5+6+6);
+
+                                /*S wrapped - the two S coordinates are not contiguous*/
+                                addbyte(0x66); /*MOVD XMM0, [RBX+EAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x04);
+                                addbyte(0x83);
+                                addbyte(0x66); /*MOVD XMM1, [RDX+EAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x0c);
+                                addbyte(0x82);
+                                addbyte(0x66); /*PINSRW XMM0, [RBX], 2*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x03);
+                                addbyte(0x02);
+                                addbyte(0x66); /*PINSRW XMM1, [RDX], 2*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x0a);
+                                addbyte(0x02);
+                                addbyte(0x66); /*PINSRW XMM0, 2[RBX], 3*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x43);
+                                addbyte(0x02);
+                                addbyte(0x03);
+                                addbyte(0x66); /*PINSRW XMM1, 2[RDX], 3*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x4a);
+                                addbyte(0x02);
+                                addbyte(0x03);
+                        }
+
+                        addbyte(0x49); /*MOV R8, bilinear_lookup*/
+                        addbyte(0xb8);
+                        addquad((uintptr_t)bilinear_lookup);
+
+                        addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xc2);
+                        addbyte(0x66); /*PUNPCKLBW XMM1, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xca);
+
+                        addbyte(0x4c); /*ADD RSI, R8*/
+                        addbyte(0x01);
+                        addbyte(0xc6);
+
+                        addbyte(0x66); /*PMULLW XMM0, bilinear_lookup[ESI]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x06);
+                        addbyte(0x66); /*PMULLW XMM1, bilinear_lookup[ESI]+0x10*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x4e);
+                        addbyte(0x10);
+                        addbyte(0x66); /*PADDW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc0 | 1 | (0 << 3));
+                        addbyte(0x66); /*MOV XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0xc0 | 0 | (1 << 3));
+                        addbyte(0x66); /*PSRLDQ XMM0, 64*/
+                        addbyte(0x0f);
+                        addbyte(0x73);
+                        addbyte(0xd8);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc0 | 1 | (0 << 3));
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0 | 0);
+                        addbyte(8);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                        
+                        addbyte(0x4c); /*MOV RSI, R15*/
+                        addbyte(0x89);
+                        addbyte(0xfe);
+
+                        addbyte(0x66); /*MOV EAX, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xc0);                        
+                }
+                else
+                {
+                        addbyte(0xb2); /*MOV DL, 8*/
+                        addbyte(8);
+                        addbyte(0x8b); /*MOV ECX, state->lod[RDI]*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0x48); /*MOV RBP, state->tex[RDI+RCX*8]*/
+                        addbyte(0x8b);
+                        addbyte(0xac);
+                        addbyte(0xcf);
+                        addlong(offsetof(voodoo_state_t, tex[tmu]));
+                        addbyte(0x28); /*SUB DL, CL*/
+                        addbyte(0xca);
+                        addbyte(0x80); /*ADD CL, 4*/
+                        addbyte(0xc1);
+                        addbyte(4);
+                        addbyte(0x8b); /*MOV EAX, state->tex_s[EDI]*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_s));
+                        addbyte(0x8b); /*MOV EBX, state->tex_t[EDI]*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_t));
+                        if (params->tLOD[tmu] & LOD_TMIRROR_S)
+                        {
+                                addbyte(0xa9); /*TEST EAX, 0x1000*/
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EAX*/
+                                addbyte(0xd0);
+                        }
+                        if (params->tLOD[tmu] & LOD_TMIRROR_T)
+                        {
+                                addbyte(0xf7); /*TEST EBX, 0x1000*/
+                                addbyte(0xc3);
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EBX*/
+                                addbyte(0xd3);
+                        }
+                        addbyte(0xd3); /*SHR EAX, CL*/
+                        addbyte(0xe8);
+                        addbyte(0xd3); /*SHR EBX, CL*/
+                        addbyte(0xeb);
+                        if (state->clamp_s[tmu])
+                        {
+                                addbyte(0x85); /*TEST EAX, EAX*/
+                                addbyte(0xc0);
+                                addbyte(0x41); /*CMOVS EAX, R10(alookup[0](zero))*/
+                                addbyte(0x0f);
+                                addbyte(0x48);
+                                addbyte(0x02);
+                                addbyte(0x3b); /*CMP EAX, params->tex_w_mask[ESI+ECX*4]*/
+                                addbyte(0x84);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]) - 0x10);
+                                addbyte(0x0f); /*CMOVAE EAX, params->tex_w_mask[ESI+ECX*4]*/
+                                addbyte(0x43);
+                                addbyte(0x84);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]) - 0x10);
+
+                        }
+                        else
+                        {
+                                addbyte(0x23); /*AND EAX, params->tex_w_mask-0x10[ESI+ECX*4]*/
+                                addbyte(0x84);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]) - 0x10);
+                        }
+                        if (state->clamp_t[tmu])
+                        {
+                                addbyte(0x85); /*TEST EBX, EBX*/
+                                addbyte(0xdb);
+                                addbyte(0x41); /*CMOVS EBX, R10(alookup[0](zero))*/
+                                addbyte(0x0f);
+                                addbyte(0x48);
+                                addbyte(0x1a);
+                                addbyte(0x3b); /*CMP EBX, params->tex_h_mask[ESI+ECX*4]*/
+                                addbyte(0x9c);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]) - 0x10);
+                                addbyte(0x0f); /*CMOVAE EBX, params->tex_h_mask[ESI+ECX*4]*/
+                                addbyte(0x43);
+                                addbyte(0x9c);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]) - 0x10);
+                        }
+                        else
+                        {
+                                addbyte(0x23); /*AND EBX, params->tex_h_mask-0x10[ESI+ECX*4]*/
+                                addbyte(0x9c);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]) - 0x10);
+                        }
+                        addbyte(0x88); /*MOV CL, DL*/
+                        addbyte(0xd1);
+                        addbyte(0xd3); /*SHL EBX, CL*/
+                        addbyte(0xe3);
+                        addbyte(0x01); /*ADD EBX, EAX*/
+                        addbyte(0xc3);
+
+                        addbyte(0x8b); /*MOV EAX, [RBP+RBX*4]*/
+                        addbyte(0x44);
+                        addbyte(0x9d);
+                        addbyte(0);
+                }
+        }
+
+        return block_pos;
+}
+
+static inline void voodoo_generate(uint8_t *code_block, voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int depthop)
+{        
+        int block_pos = 0;
+        int z_skip_pos = 0;
+        int a_skip_pos = 0;
+        int chroma_skip_pos = 0;
+        int depth_jump_pos = 0;
+        int depth_jump_pos2 = 0;
+        int loop_jump_pos = 0;
+//        xmm_01_w = (__m128i)0x0001000100010001ull;
+//        xmm_ff_w = (__m128i)0x00ff00ff00ff00ffull;
+//        xmm_ff_b = (__m128i)0x00000000ffffffffull;
+        xmm_01_w = _mm_set_epi32(0, 0, 0x00010001, 0x00010001);
+        xmm_ff_w = _mm_set_epi32(0, 0, 0x00ff00ff, 0x00ff00ff);
+        xmm_ff_b = _mm_set_epi32(0, 0, 0, 0x00ffffff);
+        minus_254 = _mm_set_epi32(0, 0, 0xff02ff02, 0xff02ff02);
+//        *(uint64_t *)&const_1_48 = 0x45b0000000000000ull;
+//        block_pos = 0;
+//        voodoo_get_depth = &code_block[block_pos];
+        /*W at (%esp+4)
+          Z at (%esp+12)
+          new_depth at (%esp+16)*/
+//        if ((params->fbzMode & FBZ_DEPTH_ENABLE) && (depth_op == DEPTHOP_NEVER))
+//        {
+//                addbyte(0xC3); /*RET*/
+//                return;
+//        }
+        addbyte(0x55); /*PUSH RBP*/
+        addbyte(0x57); /*PUSH RDI*/
+        addbyte(0x56); /*PUSH RSI*/
+        addbyte(0x53); /*PUSH RBX*/
+        addbyte(0x41); /*PUSH R12*/
+        addbyte(0x54);
+        addbyte(0x41); /*PUSH R13*/
+        addbyte(0x55);
+        addbyte(0x41); /*PUSH R14*/
+        addbyte(0x56);
+        addbyte(0x41); /*PUSH R15*/
+        addbyte(0x57);
+        
+        addbyte(0x49); /*MOV R15, xmm_01_w*/
+        addbyte(0xbf);
+        addquad((uint64_t)(uintptr_t)&xmm_01_w);
+        addbyte(0x66); /*MOVDQA XMM8, [R15]*/
+        addbyte(0x45);
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x07 | (0 << 3));
+        addbyte(0x49); /*MOV R15, xmm_ff_w*/
+        addbyte(0xbf);
+        addquad((uint64_t)(uintptr_t)&xmm_ff_w);
+        addbyte(0x66); /*MOVDQA XMM9, [R15]*/
+        addbyte(0x45);
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x07 | (1 << 3));
+        addbyte(0x49); /*MOV R15, xmm_ff_b*/
+        addbyte(0xbf);
+        addquad((uint64_t)(uintptr_t)&xmm_ff_b);
+        addbyte(0x66); /*MOVDQA XMM10, [R15]*/
+        addbyte(0x45);
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x07 | (2 << 3));
+        addbyte(0x49); /*MOV R15, minus_254*/
+        addbyte(0xbf);
+        addquad((uint64_t)(uintptr_t)&minus_254);
+        addbyte(0x66); /*MOVDQA XMM11, [R15]*/
+        addbyte(0x45);
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x07 | (3 << 3));
+
+#if WIN64
+        addbyte(0x48); /*MOV RDI, RCX (voodoo_state)*/
+        addbyte(0x89);
+        addbyte(0xcf);
+        addbyte(0x49); /*MOV R15, RDX (voodoo_params)*/
+        addbyte(0x89);
+        addbyte(0xd7);
+        addbyte(0x4d); /*MOV R14, R9 (real_y)*/
+        addbyte(0x89);
+        addbyte(0xce);
+#else
+        addbyte(0x49); /*MOV R14, RCX (real_y)*/
+        addbyte(0x89);
+        addbyte(0xce);
+       addbyte(0x49); /*MOV R15, RSI (voodoo_params)*/
+       addbyte(0x89);
+       addbyte(0xf7);
+#endif
+
+        addbyte(0x49); /*MOV R9, logtable*/
+        addbyte(0xb8 | (9 & 7));
+        addquad((uint64_t)(uintptr_t)&logtable);
+        addbyte(0x49); /*MOV R10, alookup*/
+        addbyte(0xb8 | (10 & 7));
+        addquad((uint64_t)(uintptr_t)&alookup);
+        addbyte(0x49); /*MOV R11, aminuslookup*/
+        addbyte(0xb8 | (11 & 7));
+        addquad((uint64_t)(uintptr_t)&aminuslookup);
+        addbyte(0x49); /*MOV R12, xmm_00_ff_w*/
+        addbyte(0xb8 | (12 & 7));
+        addquad((uint64_t)(uintptr_t)&xmm_00_ff_w);
+        addbyte(0x49); /*MOV R13, i_00_ff_w*/
+        addbyte(0xb8 | (13 & 7));
+        addquad((uint64_t)(uintptr_t)&i_00_ff_w);
+
+        loop_jump_pos = block_pos;
+        addbyte(0x4c); /*MOV RSI, R15*/
+        addbyte(0x89);
+        addbyte(0xfe);
+        if (params->col_tiled || params->aux_tiled)
+        {
+                addbyte(0x8b); /*MOV EAX, state->x[EDI]*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x89); /*MOV EBX, EAX*/
+                addbyte(0xc3);
+                addbyte(0x83); /*AND EAX, 63*/
+                addbyte(0xe0);
+                addbyte(63);
+                addbyte(0xc1); /*SHR EBX, 6*/
+                addbyte(0xeb);
+                addbyte(6);
+                addbyte(0xc1); /*SHL EBX, 11  - tile is 128*32, << 12, div 2 because word index*/
+                addbyte(0xe3);
+                addbyte(11);
+                addbyte(0x01); /*ADD EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x89); /*MOV state->x_tiled[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, x_tiled));
+        }
+        addbyte(0x66); /*PXOR XMM2, XMM2*/
+        addbyte(0x0f);
+        addbyte(0xef);
+        addbyte(0xd2);
+
+        if ((params->fbzMode & FBZ_W_BUFFER) || (params->fogMode & (FOG_ENABLE|FOG_CONSTANT|FOG_Z|FOG_ALPHA)) == FOG_ENABLE)
+        {
+                addbyte(0xb8); /*MOV new_depth, 0*/
+                addlong(0);
+                addbyte(0x66); /*TEST w+4, 0xffff*/
+                addbyte(0xf7);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, w)+4);
+                addword(0xffff);
+                addbyte(0x75); /*JNZ got_depth*/
+                depth_jump_pos = block_pos;
+                addbyte(0);
+//                addbyte(4+5+2+3+2+5+5+3+2+2+2+/*3+*/3+2+6+4+5+2+3);
+                addbyte(0x8b); /*MOV EDX, w*/
+                addbyte(0x97);
+                addlong(offsetof(voodoo_state_t, w));
+                addbyte(0xb8); /*MOV new_depth, 0xf001*/
+                addlong(0xf001);
+                addbyte(0x89); /*MOV EBX, EDX*/
+                addbyte(0xd3);
+                addbyte(0xc1); /*SHR EDX, 16*/
+                addbyte(0xea);
+                addbyte(16);
+                addbyte(0x74); /*JZ got_depth*/
+                depth_jump_pos2 = block_pos;
+                addbyte(0);
+//                addbyte(5+5+3+2+2+2+/*3+*/3+2+6+4+5+2+3);
+                addbyte(0xb9); /*MOV ECX, 19*/
+                addlong(19);
+                addbyte(0x0f); /*BSR EAX, EDX*/
+                addbyte(0xbd);
+                addbyte(0xc2);
+                addbyte(0xba); /*MOV EDX, 15*/
+                addlong(15);
+                addbyte(0xf7); /*NOT EBX*/
+                addbyte(0xd3);
+                addbyte(0x29); /*SUB EDX, EAX - EDX = exp*/
+                addbyte(0xc2);
+                addbyte(0x29); /*SUB ECX, EDX*/
+                addbyte(0xd1);
+                addbyte(0xc1); /*SHL EDX, 12*/
+                addbyte(0xe2);
+                addbyte(12);
+                addbyte(0xd3); /*SHR EBX, CL*/
+                addbyte(0xeb);
+                addbyte(0x81); /*AND EBX, 0xfff - EBX = mant*/
+                addbyte(0xe3);
+                addlong(0xfff);
+                addbyte(0x67); /*LEA EAX, 1[EDX, EBX]*/
+                addbyte(0x8d);
+                addbyte(0x44);
+                addbyte(0x13);
+                addbyte(1);
+                addbyte(0xbb); /*MOV EBX, 0xffff*/
+                addlong(0xffff);
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x0f); /*CMOVA EAX, EBX*/
+                addbyte(0x47);
+                addbyte(0xc3);
+
+                if (depth_jump_pos)
+                        *(uint8_t *)&code_block[depth_jump_pos] = (block_pos - depth_jump_pos) - 1;
+                if (depth_jump_pos)
+                        *(uint8_t *)&code_block[depth_jump_pos2] = (block_pos - depth_jump_pos2) - 1;
+                
+                if ((params->fogMode & (FOG_ENABLE|FOG_CONSTANT|FOG_Z|FOG_ALPHA)) == FOG_ENABLE)
+                {
+                        addbyte(0x89); /*MOV state->w_depth[EDI], EAX*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, w_depth));
+                }
+        }
+        if (!(params->fbzMode & FBZ_W_BUFFER))
+        {
+                addbyte(0x8b); /*MOV EAX, z*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, z));
+                addbyte(0xbb); /*MOV EBX, 0xffff*/
+                addlong(0xffff);
+                addbyte(0x31); /*XOR ECX, ECX*/
+                addbyte(0xc9);
+                addbyte(0xc1); /*SAR EAX, 12*/
+                addbyte(0xf8);
+                addbyte(12);
+                addbyte(0x0f); /*CMOVS EAX, ECX*/
+                addbyte(0x48);
+                addbyte(0xc1);
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x0f); /*CMOVA EAX, EBX*/
+                addbyte(0x47);
+                addbyte(0xc3);
+        }
+
+        if (params->fbzMode & FBZ_DEPTH_BIAS)
+        {
+                addbyte(0x03); /*ADD EAX, params->zaColor[ESI]*/
+                addbyte(0x86);
+                addlong(offsetof(voodoo_params_t, zaColor));                
+                addbyte(0x25); /*AND EAX, 0xffff*/
+                addlong(0xffff);
+        }
+
+        addbyte(0x89); /*MOV state->new_depth[EDI], EAX*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, new_depth));
+
+        if ((params->fbzMode & FBZ_DEPTH_ENABLE) && (depthop != DEPTHOP_ALWAYS) && (depthop != DEPTHOP_NEVER))
+        {
+                addbyte(0x8b); /*MOV EBX, state->x[EDI]*/
+                addbyte(0x9f);
+                if (params->aux_tiled)
+                        addlong(offsetof(voodoo_state_t, x_tiled));
+                else
+                        addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x48); /*MOV RCX, aux_mem[RDI]*/
+                addbyte(0x8b);
+                addbyte(0x8f);
+                addlong(offsetof(voodoo_state_t, aux_mem));
+                addbyte(0x0f); /*MOVZX EBX, [ECX+EBX*2]*/
+                addbyte(0xb7);
+                addbyte(0x1c);
+                addbyte(0x59);
+                if (params->fbzMode & FBZ_DEPTH_SOURCE)
+                {
+                        addbyte(0x0f); /*MOVZX EAX, zaColor[RSI]*/
+                        addbyte(0xb7);
+                        addbyte(0x86);
+                        addlong(offsetof(voodoo_params_t, zaColor));
+                }
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                if (depthop == DEPTHOP_LESSTHAN)
+                {
+                        addbyte(0x0f); /*JAE skip*/
+                        addbyte(0x83);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_EQUAL)
+                {
+                        addbyte(0x0f); /*JNE skip*/
+                        addbyte(0x85);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_LESSTHANEQUAL)
+                {
+                        addbyte(0x0f); /*JA skip*/
+                        addbyte(0x87);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_GREATERTHAN)
+                {
+                        addbyte(0x0f); /*JBE skip*/
+                        addbyte(0x86);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_NOTEQUAL)
+                {
+                        addbyte(0x0f); /*JE skip*/
+                        addbyte(0x84);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_GREATERTHANEQUAL)
+                {
+                        addbyte(0x0f); /*JB skip*/
+                        addbyte(0x82);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else
+                        fatal("Bad depth_op\n");
+        }
+        else if ((params->fbzMode & FBZ_DEPTH_ENABLE) && (depthop == DEPTHOP_NEVER))
+        {
+                addbyte(0xC3); /*RET*/
+        }
+
+        /*XMM0 = colour*/
+        /*XMM2 = 0 (for unpacking)*/
+        
+        /*EDI = state, ESI = params*/
+
+        if ((params->textureMode[0] & TEXTUREMODE_LOCAL_MASK) == TEXTUREMODE_LOCAL || !voodoo->dual_tmus)
+        {
+                /*TMU0 only sampling local colour or only one TMU, only sample TMU0*/
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 0);
+                
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+                addbyte(0xc1); /*SHR EAX, 24*/
+                addbyte(0xe8);
+                addbyte(24);
+                addbyte(0x89); /*MOV state->tex_a[RDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_a));
+        }
+        else if ((params->textureMode[0] & TEXTUREMODE_MASK) == TEXTUREMODE_PASSTHROUGH)
+        {
+                /*TMU0 in pass-through mode, only sample TMU1*/
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 1);
+                
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+                addbyte(0xc1); /*SHR EAX, 24*/
+                addbyte(0xe8);
+                addbyte(24);
+                addbyte(0x89); /*MOV state->tex_a[RDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_a));
+        }
+        else
+        {
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 1);
+
+                addbyte(0x66); /*MOVD XMM3, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xd8);
+                if ((params->textureMode[1] & TEXTUREMODE_TRILINEAR) && tc_sub_clocal_1)
+                {
+                        addbyte(0x8b); /*MOV EAX, state->lod*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        if (!tc_reverse_blend_1)
+                        {
+                                addbyte(0xbb); /*MOV EBX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                        }
+                        addbyte(0x83); /*AND EAX, 1*/
+                        addbyte(0xe0);
+                        addbyte(1);
+                        if (!tca_reverse_blend_1)
+                        {
+                                addbyte(0xb9); /*MOV ECX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR ECX, ECX*/
+                                addbyte(0xc9);
+                        }
+                        addbyte(0x31); /*XOR EBX, EAX*/
+                        addbyte(0xc3);
+                        addbyte(0x31); /*XOR ECX, EAX*/
+                        addbyte(0xc1);
+                        addbyte(0xc1); /*SHL EBX, 4*/
+                        addbyte(0xe3);
+                        addbyte(4);
+                        /*EBX = tc_reverse_blend, ECX=tca_reverse_blend*/
+                }
+                addbyte(0x66); /*PUNPCKLBW XMM3, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xda);
+                if (tc_sub_clocal_1)
+                {
+                        switch (tc_mselect_1)
+                        {
+                                case TC_MSELECT_ZERO:
+                                addbyte(0x66); /*PXOR XMM0, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xc0);
+                                break;
+                                case TC_MSELECT_CLOCAL:
+                                addbyte(0xf3); /*MOVQ XMM0, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0x7e);
+                                addbyte(0xc3);
+                                break;
+                                case TC_MSELECT_AOTHER:
+                                addbyte(0x66); /*PXOR XMM0, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xc0);
+                                break;
+                                case TC_MSELECT_ALOCAL:
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM3, 0xff*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc3);
+                                addbyte(0xff);
+                                break;
+                                case TC_MSELECT_DETAIL:
+                                addbyte(0xb8); /*MOV EAX, params->detail_bias[1]*/
+                                addlong(params->detail_bias[1]);
+                                addbyte(0x2b); /*SUB EAX, state->lod*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod));
+                                addbyte(0xba); /*MOV EDX, params->detail_max[1]*/
+                                addlong(params->detail_max[1]);
+                                addbyte(0xc1); /*SHL EAX, params->detail_scale[1]*/
+                                addbyte(0xe0);
+                                addbyte(params->detail_scale[1]);
+                                addbyte(0x39); /*CMP EAX, EDX*/
+                                addbyte(0xd0);
+                                addbyte(0x0f); /*CMOVNL EAX, EDX*/
+                                addbyte(0x4d);
+                                addbyte(0xc2);
+                                addbyte(0x66); /*MOVD XMM0, EAX*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0xc0);
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM0, 0*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc0);
+                                addbyte(0);
+                                break;
+                                case TC_MSELECT_LOD_FRAC:
+                                addbyte(0x66); /*MOVD XMM0, state->lod_frac[1]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod_frac[1]));
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM0, 0*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc0);
+                                addbyte(0);
+                                break;
+                        }
+                        if (params->textureMode[1] & TEXTUREMODE_TRILINEAR)
+                        {
+                                addbyte(0x66); /*PXOR XMM0, R12(xmm_00_ff_w)[EBX]*/
+                                addbyte(0x41);
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0x04);
+                                addbyte(0x1c);
+                        }
+                        else if (!tc_reverse_blend_1)
+                        {
+                                addbyte(0x66); /*PXOR XMM0, XMM9(xmm_ff_w)*/
+                                addbyte(0x41);
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xc1);
+                        }
+                        addbyte(0x66); /*PADDW XMM0, XMM8(xmm_01_w)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc0);
+                        addbyte(0xf3); /*MOVQ XMM1, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xca);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PMULLW XMM0, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xc3);
+                        addbyte(0x66); /*PMULHW XMM5, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xe5);
+                        addbyte(0xeb);
+                        addbyte(0x66); /*PUNPCKLWD XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0x61);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRAD XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x72);
+                        addbyte(0xe0);
+                        addbyte(8);
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PSUBW XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc8);
+                        if (tc_add_clocal_1)
+                        {
+                                addbyte(0x66); /*PADDW XMM1, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0xfd);
+                                addbyte(0xcb);
+                        }
+                        else if (tc_add_alocal_1)
+                        {
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM3, 0xff*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc3);
+                                addbyte(0xff);
+                                addbyte(0x66); /*PADDW XMM1, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xfd);
+                                addbyte(0xc8);
+                        }
+                        addbyte(0x66); /*PACKUSWB XMM3, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xd9);
+                        if (tca_sub_clocal_1)
+                        {
+                                addbyte(0x66); /*MOVD EBX, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0x7e);
+                                addbyte(0xdb);
+                        }
+                        addbyte(0x66); /*PUNPCKLBW XMM3, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xda);
+                }
+
+                if (tca_sub_clocal_1)
+                {
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        switch (tca_mselect_1)
+                        {
+                                case TCA_MSELECT_ZERO:
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0);
+                                break;
+                                case TCA_MSELECT_CLOCAL:
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                break;
+                                case TCA_MSELECT_AOTHER:
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0);
+                                break;
+                                case TCA_MSELECT_ALOCAL:
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                break;
+                                case TCA_MSELECT_DETAIL:
+                                addbyte(0xb8); /*MOV EAX, params->detail_bias[1]*/
+                                addlong(params->detail_bias[1]);
+                                addbyte(0x2b); /*SUB EAX, state->lod*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod));
+                                addbyte(0xba); /*MOV EDX, params->detail_max[1]*/
+                                addlong(params->detail_max[1]);
+                                addbyte(0xc1); /*SHL EAX, params->detail_scale[1]*/
+                                addbyte(0xe0);
+                                addbyte(params->detail_scale[1]);
+                                addbyte(0x39); /*CMP EAX, EDX*/
+                                addbyte(0xd0);
+                                addbyte(0x0f); /*CMOVNL EAX, EDX*/
+                                addbyte(0x4d);
+                                addbyte(0xc2);
+                                break;
+                                case TCA_MSELECT_LOD_FRAC:
+                                addbyte(0x8b); /*MOV EAX, state->lod_frac[1]*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod_frac[1]));
+                                break;
+                        }
+                        if (params->textureMode[1] & TEXTUREMODE_TRILINEAR)
+                        {
+                                addbyte(0x41); /*XOR EAX, R13(i_00_ff_w)[ECX*4]*/
+                                addbyte(0x33);
+                                addbyte(0x44);
+                                addbyte(0x8d);
+                                addbyte(0);
+                        }
+                        else if (!tc_reverse_blend_1)
+                        {
+                                addbyte(0x35); /*XOR EAX, 0xff*/
+                                addlong(0xff);
+                        }
+                        /* ADD r/m32, imm8 is opcode 0x83 /0 (as used for "ADD EBX, 1" in the
+                           TMU0 alpha path). 0x8e is MOV Sreg, r/m16, which would make the
+                           following immediate byte decode as the start of another instruction. */
+                        addbyte(0x83); /*ADD EAX, 1*/
+                        addbyte(0xc0);
+                        addbyte(1);
+                        addbyte(0x0f); /*IMUL EAX, EBX*/
+                        addbyte(0xaf);
+                        addbyte(0xc3);
+                        addbyte(0xb9); /*MOV ECX, 0xff*/
+                        addlong(0xff);
+                        addbyte(0xf7); /*NEG EAX*/
+                        addbyte(0xd8);
+                        addbyte(0xc1); /*SAR EAX, 8*/
+                        addbyte(0xf8);
+                        addbyte(8);
+                        if (tca_add_clocal_1 || tca_add_alocal_1)
+                        {
+                                addbyte(0x01); /*ADD EAX, EBX*/
+                                addbyte(0xd8);
+                        }
+                        /* ECX was loaded with 0xff above; after CMP/CMOVA it holds the
+                           unsigned minimum of 0xff and EAX. */
+                        addbyte(0x39); /*CMP ECX, EAX*/
+                        addbyte(0xc1);
+                        addbyte(0x0f); /*CMOVA ECX, EAX*/
+                        addbyte(0x47);
+                        addbyte(0xc8);
+                        /* ModRM 0xd8 selects XMM3 as destination and EAX as source, so the
+                           low word of EAX is written to word 3 of XMM3.
+                           NOTE(review): the clamped value lives in ECX, not EAX - confirm
+                           that inserting EAX here is intended. */
+                        addbyte(0x66); /*PINSRW XMM3, EAX, 3*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0xd8);
+                        addbyte(3);
+                }
+        
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 0);
+
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+                addbyte(0x66); /*MOVD XMM7, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xf8);
+        
+                if (params->textureMode[0] & TEXTUREMODE_TRILINEAR)
+                {
+                        addbyte(0x8b); /*MOV EAX, state->lod*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        if (!tc_reverse_blend)
+                        {
+                                addbyte(0xbb); /*MOV EBX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                        }
+                        addbyte(0x83); /*AND EAX, 1*/
+                        addbyte(0xe0);
+                        addbyte(1);
+                        if (!tca_reverse_blend)
+                        {
+                                addbyte(0xb9); /*MOV ECX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR ECX, ECX*/
+                                addbyte(0xc9);
+                        }
+                        addbyte(0x31); /*XOR EBX, EAX*/
+                        addbyte(0xc3);
+                        addbyte(0x31); /*XOR ECX, EAX*/
+                        addbyte(0xc1);
+                        addbyte(0xc1); /*SHL EBX, 4*/
+                        addbyte(0xe3);
+                        addbyte(4);
+                        /*EBX = tc_reverse_blend, ECX=tca_reverse_blend*/
+                }
+
+                /*XMM0 = TMU0 output, XMM3 = TMU1 output*/
+
+                addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xc2);
+                if (tc_zero_other)
+                {
+                        addbyte(0x66); /*PXOR XMM1, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc9);
+                }
+                else
+                {
+                        addbyte(0xf3); /*MOV XMM1, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xcb);
+                }
+                if (tc_sub_clocal)
+                {
+                        addbyte(0x66); /*PSUBW XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc8);
+                }
+
+                switch (tc_mselect)
+                {
+                        case TC_MSELECT_ZERO:
+                        addbyte(0x66); /*PXOR XMM4, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe4);
+                        break;
+                        case TC_MSELECT_CLOCAL:
+                        addbyte(0xf3); /*MOV XMM4, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe0);
+                        break;
+                        case TC_MSELECT_AOTHER:
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM3, 3, 3, 3, 3*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe3);
+                        addbyte(0xff);
+                        break;
+                        case TC_MSELECT_ALOCAL:
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM0, 3, 3, 3, 3*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe0);
+                        addbyte(0xff);
+                        break;
+                        case TC_MSELECT_DETAIL:
+                        addbyte(0xb8); /*MOV EAX, params->detail_bias[0]*/
+                        addlong(params->detail_bias[0]);
+                        addbyte(0x2b); /*SUB EAX, state->lod*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xba); /*MOV EDX, params->detail_max[0]*/
+                        addlong(params->detail_max[0]);
+                        addbyte(0xc1); /*SHL EAX, params->detail_scale[0]*/
+                        addbyte(0xe0);
+                        addbyte(params->detail_scale[0]);
+                        addbyte(0x39); /*CMP EAX, EDX*/
+                        addbyte(0xd0);
+                        addbyte(0x0f); /*CMOVNL EAX, EDX*/
+                        addbyte(0x4d);
+                        addbyte(0xc2);
+                        addbyte(0x66); /*MOVD XMM4, EAX*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0xe0);
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM4, 0*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe4);
+                        addbyte(0);
+                        break;
+                        case TC_MSELECT_LOD_FRAC:
+                        /* ModRM 0xa7 / 0xe4 both select XMM4: load lod_frac[0] and replicate
+                           its low word across the four low word lanes of XMM4. */
+                        addbyte(0x66); /*MOVD XMM4, state->lod_frac[0]*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0xa7);
+                        addlong(offsetof(voodoo_state_t, lod_frac[0]));
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM4, 0*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe4);
+                        addbyte(0);
+                        break;
+                }
+                if (params->textureMode[0] & TEXTUREMODE_TRILINEAR)
+                {
+                        addbyte(0x66); /*PXOR XMM4, R12(xmm_00_ff_w)[EBX]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0x24);
+                        addbyte(0x1c);
+                }
+                else if (!tc_reverse_blend)
+                {
+                        addbyte(0x66); /*PXOR XMM4, XMM9(xmm_ff_w)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe1);
+                }
+                addbyte(0x66); /*PADDW XMM4, XMM8(xmm_01_w)*/
+                addbyte(0x41);
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0xe0);
+                addbyte(0xf3); /*MOVQ XMM5, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xe9);
+                addbyte(0x66); /*PMULLW XMM1, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xd5);
+                addbyte(0xcc);
+
+                if (tca_sub_clocal)
+                {
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                }
+
+                addbyte(0x66); /*PMULHW XMM5, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xe5);
+                addbyte(0xec);
+                addbyte(0x66); /*PUNPCKLWD XMM1, XMM5*/
+                addbyte(0x0f);
+                addbyte(0x61);
+                addbyte(0xcd);
+                addbyte(0x66); /*PSRAD XMM1, 8*/
+                addbyte(0x0f);
+                addbyte(0x72);
+                addbyte(0xe1);
+                addbyte(8);
+                addbyte(0x66); /*PACKSSDW XMM1, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x6b);
+                addbyte(0xc9);
+
+                if (tca_sub_clocal)
+                {
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                }
+
+                if (tc_add_clocal)
+                {
+                        addbyte(0x66); /*PADDW XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc8);
+                }
+                else if (tc_add_alocal)
+                {
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM0, 3, 3, 3, 3*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe0);
+                        addbyte(0xff);
+                        addbyte(0x66); /*PADDW XMM1, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xfc);
+                        addbyte(0xcc);
+                }
+                if (tc_invert_output)
+                {
+                        addbyte(0x66); /*PXOR XMM1, XMM9(xmm_ff_w)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc9);
+                }
+        
+                addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xc0);
+                addbyte(0x66); /*PACKUSWB XMM3, XMM3*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xdb);
+                addbyte(0x66); /*PACKUSWB XMM1, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xc9);
+        
+                if (tca_zero_other)
+                {
+                        addbyte(0x31); /*XOR EAX, EAX*/
+                        addbyte(0xc0);
+                }
+                else
+                {
+                        addbyte(0x66); /*MOV EAX, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xd8);
+                        addbyte(0xc1); /*SHR EAX, 24*/
+                        addbyte(0xe8);
+                        addbyte(24);
+                }
+                if (tca_sub_clocal)
+                {
+                        addbyte(0x29); /*SUB EAX, EBX*/
+                        addbyte(0xd8);
+                }
+                switch (tca_mselect)
+                {
+                        case TCA_MSELECT_ZERO:
+                        addbyte(0x31); /*XOR EBX, EBX*/
+                        addbyte(0xdb);
+                        break;
+                        case TCA_MSELECT_CLOCAL:
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        break;
+                        case TCA_MSELECT_AOTHER:
+                        addbyte(0x66); /*MOV EBX, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xdb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        break;
+                        case TCA_MSELECT_ALOCAL:
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        break;
+                        case TCA_MSELECT_DETAIL:
+                        addbyte(0xbb); /*MOV EBX, params->detail_bias[1]*/
+                        addlong(params->detail_bias[1]);
+                        addbyte(0x2b); /*SUB EBX, state->lod*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xba); /*MOV EDX, params->detail_max[1]*/
+                        addlong(params->detail_max[1]);
+                        addbyte(0xc1); /*SHL EBX, params->detail_scale[1]*/
+                        addbyte(0xe3);
+                        addbyte(params->detail_scale[1]);
+                        addbyte(0x39); /*CMP EBX, EDX*/
+                        addbyte(0xd3);
+                        addbyte(0x0f); /*CMOVNL EBX, EDX*/
+                        addbyte(0x4d);
+                        addbyte(0xda);
+                        break;
+                        case TCA_MSELECT_LOD_FRAC:
+                        addbyte(0x8b); /*MOV EBX, state->lod_frac[0]*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, lod_frac[0]));
+                        break;
+                }
+                if (params->textureMode[0] & TEXTUREMODE_TRILINEAR)
+                {
+                        addbyte(0x41); /*XOR EBX, R13(i_00_ff_w)[ECX*4]*/
+                        addbyte(0x33);
+                        addbyte(0x5c);
+                        addbyte(0x8d);
+                        addbyte(0);
+                }
+                else if (!tca_reverse_blend)
+                {
+                        addbyte(0x81); /*XOR EBX, 0xFF*/
+                        addbyte(0xf3);
+                        addlong(0xff);
+                }
+
+                addbyte(0x83); /*ADD EBX, 1*/
+                addbyte(0xc3);
+                addbyte(1);
+                addbyte(0x0f); /*IMUL EAX, EBX*/
+                addbyte(0xaf);
+                addbyte(0xc3);
+                addbyte(0x31); /*XOR EDX, EDX*/
+                addbyte(0xd2);
+                addbyte(0xc1); /*SAR EAX, 8*/
+                addbyte(0xf8);
+                addbyte(8);
+                if (tca_add_clocal || tca_add_alocal)
+                {
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        addbyte(0x01); /*ADD EAX, EBX*/
+                        addbyte(0xd8);
+                }
+                addbyte(0x0f); /*CMOVS EAX, EDX*/
+                addbyte(0x48);
+                addbyte(0xc2);
+                addbyte(0xba); /*MOV EDX, 0xff*/
+                addlong(0xff);
+                addbyte(0x3d); /*CMP EAX, 0xff*/
+                addlong(0xff);
+                addbyte(0x0f); /*CMOVA EAX, EDX*/
+                addbyte(0x47);
+                addbyte(0xc2);
+                if (tca_invert_output)
+                {
+                        addbyte(0x35); /*XOR EAX, 0xff*/
+                        addlong(0xff);
+                }
+
+                addbyte(0x89); /*MOV state->tex_a[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_a));
+
+                addbyte(0xf3); /*MOVQ XMM0, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xc1);
+        }
+        /* When the colour-combine multiplier selects the texture RGB, keep a copy of
+           XMM0 in XMM4. F3 0F 7E is the register-to-register MOVQ form (low 64 bits). */
+        if (cc_mselect == CC_MSELECT_TEXRGB)
+        {
+                addbyte(0xf3); /*MOVQ XMM4, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xe0);
+        }
+
+        if ((params->fbzMode & FBZ_CHROMAKEY))
+        {
+                switch (_rgb_sel)
+                {
+                        case CC_LOCALSELECT_ITER_RGB:
+                        addbyte(0xf3); /*MOVDQU XMM0, ib*/ /* ir, ig and ib must be in same dqword!*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, ib));
+                        addbyte(0x66); /*PSRAD XMM0, 12*/
+                        addbyte(0x0f);
+                        addbyte(0x72);
+                        addbyte(0xe0);
+                        addbyte(12);
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*MOVD EAX, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xc0);
+                        break;
+                        case CC_LOCALSELECT_COLOR1:
+                        addbyte(0x8b); /*MOV EAX, params->color1[RSI]*/
+                        addbyte(0x86);
+                        addlong(offsetof(voodoo_params_t, color1));
+                        break;
+                        case CC_LOCALSELECT_TEX:
+                        addbyte(0x66); /*MOVD EAX, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xc0);
+                        break;
+                }
+                addbyte(0x8b); /*MOV EBX, params->chromaKey[ESI]*/
+                addbyte(0x9e);
+                addlong(offsetof(voodoo_params_t, chromaKey));
+                addbyte(0x31); /*XOR EBX, EAX*/
+                addbyte(0xc3);
+                addbyte(0x81); /*AND EBX, 0xffffff*/
+                addbyte(0xe3);
+                addlong(0xffffff);
+                addbyte(0x0f); /*JE skip*/
+                addbyte(0x84);
+                chroma_skip_pos = block_pos;
+                addlong(0);
+        }
+
+        if (voodoo->trexInit1[0] & (1 << 18))
+        {
+                addbyte(0xb8); /*MOV EAX, tmuConfig*/
+                addlong(voodoo->tmuConfig);
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+        }
+
+        if (params->alphaMode & ((1 << 0) | (1 << 4)))
+        {
+                /*EBX = a_other*/
+                switch (a_sel)
+                {
+                        case A_SEL_ITER_A:
+                        addbyte(0x8b); /*MOV EBX, state->ia*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, ia));
+                        addbyte(0x31); /*XOR EAX, EAX*/
+                        addbyte(0xc0);
+                        addbyte(0xba); /*MOV EDX, 0xff*/
+                        addlong(0xff);
+                        addbyte(0xc1); /*SAR EBX, 12*/
+                        addbyte(0xfb);
+                        addbyte(12);
+                        addbyte(0x0f); /*CMOVS EBX, EAX*/
+                        addbyte(0x48);
+                        addbyte(0xd8);
+                        addbyte(0x39); /*CMP EBX, EDX*/
+                        addbyte(0xd3);
+                        addbyte(0x0f); /*CMOVA EBX, EDX*/
+                        addbyte(0x47);
+                        addbyte(0xda);
+                        break;
+                        case A_SEL_TEX:
+                        addbyte(0x8b); /*MOV EBX, state->tex_a*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        break;
+                        case A_SEL_COLOR1:
+                        addbyte(0x0f); /*MOVZX EBX, params->color1+3*/
+                        addbyte(0xb6);
+                        addbyte(0x9e);
+                        addlong(offsetof(voodoo_params_t, color1)+3);
+                        break;
+                        default:
+                        addbyte(0x31); /*XOR EBX, EBX*/
+                        addbyte(0xdb);
+                        break;
+                }
+                /*ECX = a_local*/
+                switch (cca_localselect)
+                {
+                        case CCA_LOCALSELECT_ITER_A:
+                        if (a_sel == A_SEL_ITER_A)
+                        {
+                                addbyte(0x89); /*MOV ECX, EBX*/
+                                addbyte(0xd9);
+                        }
+                        else
+                        {
+                                addbyte(0x8b); /*MOV ECX, state->ia*/
+                                addbyte(0x8f);
+                                addlong(offsetof(voodoo_state_t, ia));
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0); 
+                                addbyte(0xba); /*MOV EDX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0xc1);/*SAR ECX, 12*/
+                                addbyte(0xf9);
+                                addbyte(12);
+                                addbyte(0x0f); /*CMOVS ECX, EAX*/
+                                addbyte(0x48);
+                                addbyte(0xc8);
+                                addbyte(0x39); /*CMP ECX, EDX*/
+                                addbyte(0xd1);
+                                addbyte(0x0f); /*CMOVA ECX, EDX*/
+                                addbyte(0x47);
+                                addbyte(0xca);
+                        }
+                        break;
+                        case CCA_LOCALSELECT_COLOR0:
+                        addbyte(0x0f); /*MOVZX ECX, params->color0+3*/
+                        addbyte(0xb6);
+                        addbyte(0x8e);
+                        addlong(offsetof(voodoo_params_t, color0)+3);
+                        break;
+                        case CCA_LOCALSELECT_ITER_Z:
+                        addbyte(0x8b); /*MOV ECX, state->z*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, z));
+                        if (a_sel != A_SEL_ITER_A)
+                        {
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0); 
+                                addbyte(0xba); /*MOV EDX, 0xff*/
+                                addlong(0xff);
+                        }
+                        addbyte(0xc1);/*SAR ECX, 20*/
+                        addbyte(0xf9);
+                        addbyte(20);
+                        addbyte(0x0f); /*CMOVS ECX, EAX*/
+                        addbyte(0x48);
+                        addbyte(0xc8);
+                        addbyte(0x39); /*CMP ECX, EDX*/
+                        addbyte(0xd1);
+                        addbyte(0x0f); /*CMOVA ECX, EDX*/
+                        addbyte(0x47);
+                        addbyte(0xca);
+                        break;
+                                        
+                        default:
+                        addbyte(0xb9); /*MOV ECX, 0xff*/
+                        addlong(0xff);
+                        break;
+                }
+
+                if (cca_zero_other)
+                {
+                        addbyte(0x31); /*XOR EDX, EDX*/
+                        addbyte(0xd2);
+                }
+                else
+                {
+                        addbyte(0x89); /*MOV EDX, EBX*/
+                        addbyte(0xda);
+                }
+        
+                if (cca_sub_clocal)
+                {
+                        addbyte(0x29); /*SUB EDX, ECX*/
+                        addbyte(0xca);
+                }
+        }
+
+        if (cc_sub_clocal || cc_mselect == 1 || cc_add == 1)
+        {
+                /*XMM1 = local*/
+                if (!cc_localselect_override)
+                {
+                        if (cc_localselect)
+                        {
+                                addbyte(0x66); /*MOVD XMM1, params->color0*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, color0));
+                        }
+                        else
+                        {
+                                addbyte(0xf3); /*MOVDQU XMM1, ib*/ /* ir, ig and ib must be in same dqword!*/
+                                addbyte(0x0f);
+                                addbyte(0x6f);
+                                addbyte(0x8f);
+                                addlong(offsetof(voodoo_state_t, ib));
+                                addbyte(0x66); /*PSRAD XMM1, 12*/
+                                addbyte(0x0f);
+                                addbyte(0x72);
+                                addbyte(0xe1);
+                                addbyte(12);
+                                addbyte(0x66); /*PACKSSDW XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x6b);
+                                addbyte(0xc9);
+                                addbyte(0x66); /*PACKUSWB XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x67);
+                                addbyte(0xc9);
+                        }
+                }
+                else
+                {
+                        addbyte(0xf6); /*TEST state->tex_a, 0x80 - NOTE(review): ModRM 0x87 takes no SIB byte, so the 0x23 emitted below looks stray; confirm*/
+                        addbyte(0x87);
+                        addbyte(0x23);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(0x80);
+                        addbyte(0x74);/*JZ !cc_localselect*/
+                        addbyte(8+2);
+                                addbyte(0x66); /*MOVD XMM1, params->color0*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, color0));
+                                addbyte(0xeb); /*JMP +*/
+                                addbyte(8+5+4+4);
+                        /*!cc_localselect:*/
+                                addbyte(0xf3); /*MOVDQU XMM1, ib*/ /* ir, ig and ib must be in same dqword!*/
+                                addbyte(0x0f);
+                                addbyte(0x6f);
+                                addbyte(0x8f);
+                                addlong(offsetof(voodoo_state_t, ib));
+                                addbyte(0x66); /*PSRAD XMM1, 12*/
+                                addbyte(0x0f);
+                                addbyte(0x72);
+                                addbyte(0xe1);
+                                addbyte(12);
+                                addbyte(0x66); /*PACKSSDW XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x6b);
+                                addbyte(0xc9);
+                                addbyte(0x66); /*PACKUSWB XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x67);
+                                addbyte(0xc9);
+                }
+                addbyte(0x66); /*PUNPCKLBW XMM1, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xca);
+        }
+        if (!cc_zero_other)
+        {
+                if (_rgb_sel == CC_LOCALSELECT_ITER_RGB)
+                {
+                        addbyte(0xf3); /*MOVDQU XMM0, ib*/ /* ir, ig and ib must be in same dqword!*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, ib));
+                        addbyte(0x66); /*PSRAD XMM0, 12*/
+                        addbyte(0x0f);
+                        addbyte(0x72);
+                        addbyte(0xe0);
+                        addbyte(12);
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                }
+                else if (_rgb_sel == CC_LOCALSELECT_TEX)
+                {
+#if 0
+                        addbyte(0xf3); /*MOVDQU XMM0, state->tex_b*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_b));
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+#endif
+                }
+                else if (_rgb_sel == CC_LOCALSELECT_COLOR1)
+                {
+                        addbyte(0x66); /*MOVD XMM0, params->color1*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0x86);
+                        addlong(offsetof(voodoo_params_t, color1));
+                }
+                else
+                {
+                        /*MOVD XMM0, src_r*/
+                }
+                addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xc2);
+                if (cc_sub_clocal)
+                {
+                        addbyte(0x66); /*PSUBW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc1);
+                }
+        }
+        else
+        {
+                addbyte(0x66); /*PXOR XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0xef);
+                addbyte(0xc0);
+                if (cc_sub_clocal)
+                {
+                        addbyte(0x66); /*PSUBW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc1);
+                }
+        }
+
+        if (params->alphaMode & ((1 << 0) | (1 << 4)))
+        {
+                if (!(cca_mselect == 0 && cca_reverse_blend == 0))
+                {
+                        switch (cca_mselect)
+                        {
+                                case CCA_MSELECT_ALOCAL:
+                                addbyte(0x89); /*MOV EAX, ECX*/
+                                addbyte(0xc8);
+                                break;
+                                case CCA_MSELECT_AOTHER:
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                break;
+                                case CCA_MSELECT_ALOCAL2:
+                                addbyte(0x89); /*MOV EAX, ECX*/
+                                addbyte(0xc8);
+                                break;
+                                case CCA_MSELECT_TEX:
+                                addbyte(0x0f); /*MOVZX EAX, state->tex_a*/
+                                addbyte(0xb6);
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, tex_a));
+                                break;
+
+                                case CCA_MSELECT_ZERO:
+                                default:
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0);
+                                break;
+                        }
+                        if (!cca_reverse_blend)
+                        {
+                                addbyte(0x35); /*XOR EAX, 0xff*/
+                                addlong(0xff);
+                        }
+                        addbyte(0x83); /*ADD EAX, 1*/
+                        addbyte(0xc0);
+                        addbyte(1);
+                        addbyte(0x0f); /*IMUL EDX, EAX*/
+                        addbyte(0xaf);
+                        addbyte(0xd0);
+                        addbyte(0xc1); /*SHR EDX, 8*/
+                        addbyte(0xea);
+                        addbyte(8);
+                }
+        }
+
+        if ((params->alphaMode & ((1 << 0) | (1 << 4))))
+        {
+                addbyte(0x31); /*XOR EAX, EAX*/
+                addbyte(0xc0);
+        }
+        
+        if (!(cc_mselect == 0 && cc_reverse_blend == 0) && cc_mselect == CC_MSELECT_AOTHER)
+        {
+                /*Copy a_other to XMM3 before it gets modified*/
+                addbyte(0x66); /*MOVD XMM3, EDX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xda);
+                addbyte(0xf2); /*PSHUFLW XMM3, XMM3, 0*/
+                addbyte(0x0f);
+                addbyte(0x70);
+                addbyte(0xdb);
+                addbyte(0x00);
+        }
+        
+        if (cca_add && (params->alphaMode & ((1 << 0) | (1 << 4))))
+        {
+                addbyte(0x01); /*ADD EDX, ECX*/
+                addbyte(0xca);
+        }
+
+        if ((params->alphaMode & ((1 << 0) | (1 << 4))))
+        {
+                addbyte(0x85); /*TEST EDX, EDX*/
+                addbyte(0xd2);
+                addbyte(0x0f); /*CMOVS EDX, EAX*/
+                addbyte(0x48);
+                addbyte(0xd0);
+                addbyte(0xb8); /*MOV EAX, 0xff*/
+                addlong(0xff);
+                addbyte(0x81); /*CMP EDX, 0xff*/
+                addbyte(0xfa);
+                addlong(0xff);
+                addbyte(0x0f); /*CMOVA EDX, EAX*/
+                addbyte(0x47);
+                addbyte(0xd0);
+                if (cca_invert_output)
+                {
+                        addbyte(0x81); /*XOR EDX, 0xff*/
+                        addbyte(0xf2);
+                        addlong(0xff);
+                }
+        }
+
+        if (!(cc_mselect == 0 && cc_reverse_blend == 0))
+        {
+                switch (cc_mselect)
+                {
+                        case CC_MSELECT_ZERO:
+                        addbyte(0x66); /*PXOR XMM3, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xdb);
+                        break;
+                        case CC_MSELECT_CLOCAL:
+                        addbyte(0xf3); /*MOVQ XMM3, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xd9);
+                        break;
+                        case CC_MSELECT_ALOCAL:
+                        addbyte(0x66); /*MOVD XMM3, ECX*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0xd9);
+                        addbyte(0xf2); /*PSHUFLW XMM3, XMM3, 0*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xdb);
+                        addbyte(0x00);
+                        break;
+                        case CC_MSELECT_AOTHER:
+                        /*Handled above*/
+                        break;
+                        case CC_MSELECT_TEX:
+                        addbyte(0x66); /*PINSRW XMM3, state->tex_a, 0*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(0);
+                        addbyte(0x66); /*PINSRW XMM3, state->tex_a, 1*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(1);
+                        addbyte(0x66); /*PINSRW XMM3, state->tex_a, 2*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(2);
+                        break;
+                        case CC_MSELECT_TEXRGB:
+                        addbyte(0x66); /*PUNPCKLBW XMM4, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xe2);
+                        addbyte(0xf3); /*MOVQ XMM3, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xdc);
+                        break;
+                        default:
+                        addbyte(0x66); /*PXOR XMM3, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xdb);
+                        break;                                
+                }
+                addbyte(0xf3); /*MOVQ XMM4, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xe0);
+                if (!cc_reverse_blend)
+                {
+                        addbyte(0x66); /*PXOR XMM3, XMM9(xmm_ff_w)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xd9);
+                }
+                addbyte(0x66); /*PADDW XMM3, XMM8(xmm_01_w)*/
+                addbyte(0x41);
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0xd8);
+                addbyte(0x66); /*PMULLW XMM0, XMM3*/
+                addbyte(0x0f);
+                addbyte(0xd5);
+                addbyte(0xc3);
+                addbyte(0x66); /*PMULHW XMM4, XMM3*/
+                addbyte(0x0f);
+                addbyte(0xe5);
+                addbyte(0xe3);
+                addbyte(0x66); /*PUNPCKLWD XMM0, XMM4*/
+                addbyte(0x0f);
+                addbyte(0x61);
+                addbyte(0xc4);
+                addbyte(0x66); /*PSRAD XMM0, 8*/
+                addbyte(0x0f);
+                addbyte(0x72);
+                addbyte(0xe0);
+                addbyte(8);
+                addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x6b);
+                addbyte(0xc0);
+        }
+        
+        if (cc_add == 1)
+        {
+                addbyte(0x66); /*PADDW XMM0, XMM1*/
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0xc1);
+        }
+
+        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+        addbyte(0x0f);
+        addbyte(0x67);
+        addbyte(0xc0);
+
+        if (cc_invert_output)
+        {
+                addbyte(0x66); /*PXOR XMM0, XMM10(xmm_ff_b)*/
+                addbyte(0x41);
+                addbyte(0x0f);
+                addbyte(0xef);
+                addbyte(0xc2);
+        }
+
+        if (params->fogMode & FOG_ENABLE)
+        {
+                if (params->fogMode & FOG_CONSTANT)                     
+                {                                                       
+                        addbyte(0x66); /*MOVD XMM3, params->fogColor[ESI]*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0x9e);
+                        addlong(offsetof(voodoo_params_t, fogColor));
+                        addbyte(0x66); /*PADDUSB XMM0, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xdc);
+                        addbyte(0xc3);
+                }                                                       
+                else                                                    
+                {                                                       
+                        addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xc2);
+
+                        if (!(params->fogMode & FOG_ADD))               
+                        {
+                                addbyte(0x66); /*MOVD XMM3, params->fogColor[ESI]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, fogColor));
+                                addbyte(0x66); /*PUNPCKLBW XMM3, XMM2*/
+                                addbyte(0x0f);
+                                addbyte(0x60);
+                                addbyte(0xda);
+                        }                                               
+                        else
+                        {
+                                addbyte(0x66); /*PXOR XMM3, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xdb);
+                        }
+                                                                        
+                        if (!(params->fogMode & FOG_MULT))
+                        {
+                                addbyte(0x66); /*PSUBW XMM3, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xf9);
+                                addbyte(0xd8);
+                        }
+
+                        /*Divide by 2 to prevent overflow on multiply*/
+                        addbyte(0x66); /*PSRAW XMM3, 1*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xe3);
+                        addbyte(1);
+
+                        switch (params->fogMode & (FOG_Z|FOG_ALPHA))
+                        {
+                                case 0:
+                                addbyte(0x8b); /*MOV EBX, state->w_depth[EDI]*/
+                                addbyte(0x9f);
+                                addlong(offsetof(voodoo_state_t, w_depth));
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                addbyte(0xc1); /*SHR EBX, 10*/
+                                addbyte(0xeb);
+                                addbyte(10);
+                                addbyte(0xc1); /*SHR EAX, 2*/
+                                addbyte(0xe8);
+                                addbyte(2);
+                                addbyte(0x83); /*AND EBX, 0x3f*/
+                                addbyte(0xe3);
+                                addbyte(0x3f);
+                                addbyte(0x25); /*AND EAX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0xf6); /*MUL params->fogTable+1[ESI+EBX*2]*/
+                                addbyte(0xa4);
+                                addbyte(0x5e);
+                                addlong(offsetof(voodoo_params_t, fogTable)+1);
+                                addbyte(0x0f); /*MOVZX EBX, params->fogTable[ESI+EBX*2]*/
+                                addbyte(0xb6);
+                                addbyte(0x9c);
+                                addbyte(0x5e);
+                                addlong(offsetof(voodoo_params_t, fogTable));
+                                addbyte(0xc1); /*SHR EAX, 10*/
+                                addbyte(0xe8);
+                                addbyte(10);
+                                addbyte(0x01); /*ADD EAX, EBX*/
+                                addbyte(0xd8);
+/*                                int fog_idx = (w_depth >> 10) & 0x3f;
+
+                                fog_a = params->fogTable[fog_idx].fog;
+                                fog_a += (params->fogTable[fog_idx].dfog * ((w_depth >> 2) & 0xff)) >> 10;*/
+                                break;
+                                
+                                case FOG_Z:
+                                addbyte(0x8b); /*MOV EAX, state->z[EDI]*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, z));
+                                addbyte(0xc1); /*SHR EAX, 12*/
+                                addbyte(0xe8);
+                                addbyte(12);
+                                addbyte(0x25); /*AND EAX, 0xff*/
+                                addlong(0xff);
+//                                fog_a = (z >> 20) & 0xff; NOTE(review): the SHR emitted above uses 12, not 20 - confirm which shift is intended
+                                break;
+                                
+                                case FOG_ALPHA:
+                                addbyte(0x8b); /*MOV EAX, state->ia[EDI]*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, ia));
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                                addbyte(0xc1); /*SAR EAX, 12*/
+                                addbyte(0xf8);
+                                addbyte(12);
+                                addbyte(0x0f); /*CMOVS EAX, EBX*/
+                                addbyte(0x48);
+                                addbyte(0xc3);
+                                addbyte(0xbb); /*MOV EBX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x3d); /*CMP EAX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x0f); /*CMOVAE EAX, EBX*/
+                                addbyte(0x43);
+                                addbyte(0xc3);
+//                                fog_a = CLAMP(ia >> 12);
+                                break;
+                                
+                                case FOG_W:
+                                addbyte(0x8b); /*MOV EAX, state->w[EDI]+4*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, w)+4);
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                                addbyte(0x09); /*OR EAX, EAX*/
+                                addbyte(0xc0);
+                                addbyte(0x0f); /*CMOVS EAX, EBX*/
+                                addbyte(0x48);
+                                addbyte(0xc3);
+                                addbyte(0xbb); /*MOV EBX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x3d); /*CMP EAX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x0f); /*CMOVAE EAX, EBX*/
+                                addbyte(0x43);
+                                addbyte(0xc3);
+//                                fog_a = CLAMP(w >> 32);
+                                break;
+                        }
+                        addbyte(0x01); /*ADD EAX, EAX*/
+                        addbyte(0xc0);
+
+                        addbyte(0x66); /*PMULLW XMM3, R10(alookup)+16[EAX*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x5c);
+                        addbyte(0xc2);
+                        addbyte(16);
+                        addbyte(0x66); /*PSRAW XMM3, 7*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xe3);
+                        addbyte(7);
+
+                        if (params->fogMode & FOG_MULT)
+                        {
+                                addbyte(0xf3); /*MOVQ XMM0, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0x7e);
+                                addbyte(0xc3);
+                        }
+                        else
+                        {
+                                addbyte(0x66); /*PADDW XMM0, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0xfd);
+                                addbyte(0xc3);
+                        }
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                }
+        }
+
+        if ((params->alphaMode & 1) && (alpha_func != AFUNC_NEVER) && (alpha_func != AFUNC_ALWAYS))
+        {
+                addbyte(0x0f); /*MOVZX ECX, params->alphaMode+3*/
+                addbyte(0xb6);
+                addbyte(0x8e);
+                addlong(offsetof(voodoo_params_t, alphaMode) + 3);
+                addbyte(0x39); /*CMP EDX, ECX*/
+                addbyte(0xca);
+
+                switch (alpha_func)
+                {
+                        case AFUNC_LESSTHAN:
+                        addbyte(0x0f); /*JAE skip*/
+                        addbyte(0x83);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_EQUAL:
+                        addbyte(0x0f); /*JNE skip*/
+                        addbyte(0x85);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_LESSTHANEQUAL:
+                        addbyte(0x0f); /*JA skip*/
+                        addbyte(0x87);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_GREATERTHAN:
+                        addbyte(0x0f); /*JBE skip*/
+                        addbyte(0x86);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_NOTEQUAL:
+                        addbyte(0x0f); /*JE skip*/
+                        addbyte(0x84);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_GREATERTHANEQUAL:
+                        addbyte(0x0f); /*JB skip*/
+                        addbyte(0x82);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                }
+        }
+        else if ((params->alphaMode & 1) && (alpha_func == AFUNC_NEVER))
+        {
+                addbyte(0xC3); /*RET*/
+        }
+        
+        if (params->alphaMode & (1 << 4))
+        {
+                addbyte(0x49); /*MOV R8, rgb565*/
+                addbyte(0xb8);
+                addquad((uintptr_t)rgb565);
+                addbyte(0x8b); /*MOV EAX, state->x[EDI]*/
+                addbyte(0x87);
+                if (params->col_tiled)
+                        addlong(offsetof(voodoo_state_t, x_tiled));
+                else
+                        addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x48); /*MOV RBP, fb_mem*/
+                addbyte(0x8b);
+                addbyte(0xaf);
+                addlong(offsetof(voodoo_state_t, fb_mem));
+                addbyte(0x01); /*ADD EDX, EDX*/
+                addbyte(0xd2);
+                addbyte(0x0f); /*MOVZX EAX, [RBP+RAX*2]*/
+                addbyte(0xb7);
+                addbyte(0x44);
+                addbyte(0x45);
+                addbyte(0);
+                addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xc2);
+                addbyte(0x66); /*MOVD XMM4, rgb565[EAX*4]*/
+                addbyte(0x41);
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0x24);
+                addbyte(0x80);
+                addbyte(0x66); /*PUNPCKLBW XMM4, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xe2);
+                addbyte(0xf3); /*MOVQ XMM6, XMM4*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xf4);
+                
+                switch (dest_afunc)
+                {
+                        case AFUNC_AZERO:
+                        addbyte(0x66); /*PXOR XMM4, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe4);
+                        break;
+                        case AFUNC_ASRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM4, R10(alookup)[EDX*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x24);
+                        addbyte(0xd2);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, R10(alookup)[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x62);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_A_COLOR:
+                        addbyte(0x66); /*PMULLW XMM4, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xe0);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, R10(alookup)[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x62);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_ADST_ALPHA:
+                        break;
+                        case AFUNC_AONE:
+                        break;
+                        case AFUNC_AOMSRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM4, R11(aminuslookup)[EDX*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x24);
+                        addbyte(0xd3);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, R10(alookup)[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x62);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOM_COLOR:
+                        addbyte(0xf3); /*MOVQ XMM5, XMM9(xmm_ff_w)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe9);
+                        addbyte(0x66); /*PSUBW XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PMULLW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xe5);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x62);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOMDST_ALPHA:
+                        addbyte(0x66); /*PXOR XMM4, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe4);
+                        break;
+                        case AFUNC_ASATURATE:
+                        addbyte(0x66); /*PMULLW XMM4, XMM11(minus_254)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xe3);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x62);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                }
+
+                switch (src_afunc)
+                {
+                        case AFUNC_AZERO:
+                        addbyte(0x66); /*PXOR XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc0);
+                        break;
+                        case AFUNC_ASRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM0, R10(alookup)[EDX*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x04);
+                        addbyte(0xd2);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, R10(alookup)[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x42);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_A_COLOR:
+                        addbyte(0x66); /*PMULLW XMM0, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xc6);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, R10(alookup)[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x42);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_ADST_ALPHA:
+                        break;
+                        case AFUNC_AONE:
+                        break;
+                        case AFUNC_AOMSRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM0, R11(aminuslookup)[EDX*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x04);
+                        addbyte(0xd3);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, alookup[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x42);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOM_COLOR:
+                        addbyte(0xf3); /*MOVQ XMM5, XMM9(xmm_ff_w)*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe9);
+                        addbyte(0x66); /*PSUBW XMM5, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xee);
+                        addbyte(0x66); /*PMULLW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xc5);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, alookup[1*8]*/
+                        addbyte(0x41);
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x42);
+                        addbyte(8*2);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOMDST_ALPHA:
+                        addbyte(0x66); /*PXOR XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc0);
+                        break;
+                        case AFUNC_ACOLORBEFOREFOG:
+                        break;
+                }
+                
+                addbyte(0x66); /*PADDW XMM0, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0xc4);
+
+                addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xc0);
+        }
+
+        addbyte(0x8b); /*MOV EDX, state->x[EDI]*/
+        addbyte(0x97);
+        if (params->col_tiled)
+                addlong(offsetof(voodoo_state_t, x_tiled));
+        else
+                addlong(offsetof(voodoo_state_t, x));
+
+        addbyte(0x66); /*MOV EAX, XMM0*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xc0);
+        
+        if (params->fbzMode & FBZ_RGB_WMASK)
+        {
+                if (dither)
+                {
+                        addbyte(0x49); /*MOV R8, dither_rb*/
+                        addbyte(0xb8);
+                        addquad(dither2x2 ? (uintptr_t)dither_rb2x2 : (uintptr_t)dither_rb);
+                        addbyte(0x4c); /*MOV ESI, real_y (R14)*/
+                        addbyte(0x89);
+                        addbyte(0xf6);
+                        addbyte(0x0f); /*MOVZX EBX, AH*/ /*G*/
+                        addbyte(0xb6);
+                        addbyte(0xdc);
+                        if (dither2x2)
+                        {
+                                addbyte(0x83); /*AND EDX, 1*/
+                                addbyte(0xe2);
+                                addbyte(1);
+                                addbyte(0x83); /*AND ESI, 1*/
+                                addbyte(0xe6);
+                                addbyte(1);
+                                addbyte(0xc1); /*SHL EBX, 2*/
+                                addbyte(0xe3);
+                                addbyte(2);
+                        }
+                        else
+                        {
+                                addbyte(0x83); /*AND EDX, 3*/
+                                addbyte(0xe2);
+                                addbyte(3);
+                                addbyte(0x83); /*AND ESI, 3*/
+                                addbyte(0xe6);
+                                addbyte(3);
+                                addbyte(0xc1); /*SHL EBX, 4*/
+                                addbyte(0xe3);
+                                addbyte(4);
+                        }
+                        addbyte(0x0f); /*MOVZX ECX, AL*/ /*R*/
+                        addbyte(0xb6);
+                        addbyte(0xc8);
+                        if (dither2x2)
+                        {
+                                addbyte(0xc1); /*SHR EAX, 14*/
+                                addbyte(0xe8);
+                                addbyte(14);
+                                addbyte(0x8d); /*LEA ESI, RDX+RSI*2*/
+                                addbyte(0x34);
+                                addbyte(0x72);
+                        }
+                        else
+                        {
+                                addbyte(0xc1); /*SHR EAX, 12*/
+                                addbyte(0xe8);
+                                addbyte(12);
+                                addbyte(0x8d); /*LEA ESI, RDX+RSI*4*/
+                                addbyte(0x34);
+                                addbyte(0xb2);
+                        }
+                        addbyte(0x8b); /*MOV EDX, state->x[EDI]*/
+                        addbyte(0x97);
+                        if (params->col_tiled)
+                                addlong(offsetof(voodoo_state_t, x_tiled));
+                        else
+                                addlong(offsetof(voodoo_state_t, x));
+                        addbyte(0x4c); /*ADD RSI, R8*/
+                        addbyte(0x01);
+                        addbyte(0xc6);
+                        if (dither2x2)
+                        {
+                                addbyte(0xc1); /*SHL ECX, 2*/
+                                addbyte(0xe1);
+                                addbyte(2);
+                                addbyte(0x25); /*AND EAX, 0x3fc*/ /*B*/
+                                addlong(0x3fc);
+                        }
+                        else
+                        {
+                                addbyte(0xc1); /*SHL ECX, 4*/
+                                addbyte(0xe1);
+                                addbyte(4);
+                                addbyte(0x25); /*AND EAX, 0xff0*/ /*B*/
+                                addlong(0xff0);
+                        }
+                        addbyte(0x0f); /*MOVZX EBX, dither_g[EBX+ESI]*/
+                        addbyte(0xb6);
+                        addbyte(0x9c);
+                        addbyte(0x1e);
+                        addlong(dither2x2 ? ((uintptr_t)dither_g2x2 - (uintptr_t)dither_rb2x2) : ((uintptr_t)dither_g - (uintptr_t)dither_rb));
+                        addbyte(0x0f); /*MOVZX ECX, dither_rb[RCX+RSI]*/
+                        addbyte(0xb6);
+                        addbyte(0x0c);
+                        addbyte(0x0e);
+                        addbyte(0x0f); /*MOVZX EAX, dither_rb[RAX+RSI]*/
+                        addbyte(0xb6);
+                        addbyte(0x04);
+                        addbyte(0x06);
+                        addbyte(0xc1); /*SHL EBX, 5*/
+                        addbyte(0xe3);
+                        addbyte(5);
+                        addbyte(0xc1); /*SHL EAX, 11*/
+                        addbyte(0xe0);
+                        addbyte(11);
+                        addbyte(0x09); /*OR EAX, EBX*/
+                        addbyte(0xd8);
+                        addbyte(0x09); /*OR EAX, ECX*/
+                        addbyte(0xc8);
+                }
+                else
+                {
+                        addbyte(0x89); /*MOV EBX, EAX*/
+                        addbyte(0xc3);
+                        addbyte(0x0f); /*MOVZX ECX, AH*/
+                        addbyte(0xb6);
+                        addbyte(0xcc);
+                        addbyte(0xc1); /*SHR EAX, 3*/
+                        addbyte(0xe8);
+                        addbyte(3);
+                        addbyte(0xc1); /*SHR EBX, 8*/
+                        addbyte(0xeb);
+                        addbyte(8);
+                        addbyte(0xc1); /*SHL ECX, 3*/
+                        addbyte(0xe1);
+                        addbyte(3);
+                        addbyte(0x81); /*AND EAX, 0x001f*/
+                        addbyte(0xe0);
+                        addlong(0x001f);
+                        addbyte(0x81); /*AND EBX, 0xf800*/
+                        addbyte(0xe3);
+                        addlong(0xf800);
+                        addbyte(0x81); /*AND ECX, 0x07e0*/
+                        addbyte(0xe1);
+                        addlong(0x07e0);
+                        addbyte(0x09); /*OR EAX, EBX*/
+                        addbyte(0xd8);
+                        addbyte(0x09); /*OR EAX, ECX*/
+                        addbyte(0xc8);
+                }
+                addbyte(0x48); /*MOV RSI, fb_mem*/
+                addbyte(0x8b);
+                addbyte(0xb7);
+                addlong(offsetof(voodoo_state_t, fb_mem));
+                addbyte(0x66); /*MOV [ESI+EDX*2], AX*/
+                addbyte(0x89);
+                addbyte(0x04);
+                addbyte(0x56);
+        }
+
+        if ((params->fbzMode & (FBZ_DEPTH_WMASK | FBZ_DEPTH_ENABLE)) == (FBZ_DEPTH_WMASK | FBZ_DEPTH_ENABLE))
+        {
+                addbyte(0x8b); /*MOV EDX, state->x[EDI]*/
+                addbyte(0x97);
+                if (params->aux_tiled)
+                        addlong(offsetof(voodoo_state_t, x_tiled));
+                else
+                        addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x66); /*MOV AX, new_depth*/
+                addbyte(0x8b);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, new_depth));
+                addbyte(0x48); /*MOV RSI, aux_mem*/
+                addbyte(0x8b);
+                addbyte(0xb7);
+                addlong(offsetof(voodoo_state_t, aux_mem));
+                addbyte(0x66); /*MOV [ESI+EDX*2], AX*/
+                addbyte(0x89);
+                addbyte(0x04);
+                addbyte(0x56);
+        }
+
+        if (z_skip_pos)
+                *(uint32_t *)&code_block[z_skip_pos] = (block_pos - z_skip_pos) - 4;
+        if (a_skip_pos)
+                *(uint32_t *)&code_block[a_skip_pos] = (block_pos - a_skip_pos) - 4;
+        if (chroma_skip_pos)
+                *(uint32_t *)&code_block[chroma_skip_pos] = (block_pos - chroma_skip_pos) - 4;
+
+        addbyte(0x4c); /*MOV RSI, R15*/
+        addbyte(0x89);
+        addbyte(0xfe);
+
+        addbyte(0xf3); /*MOVDQU XMM1, state->ib[EDI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x8f);
+        addlong(offsetof(voodoo_state_t, ib));
+        addbyte(0xf3); /*MOVDQU XMM3, state->tmu0_s[EDI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x9f);
+        addlong(offsetof(voodoo_state_t, tmu0_s));
+        addbyte(0xf3); /*MOVQ XMM4, state->tmu0_w[EDI]*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xa7);
+        addlong(offsetof(voodoo_state_t, tmu0_w));
+        addbyte(0xf3); /*MOVDQU XMM0, params->dBdX[ESI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x86);
+        addlong(offsetof(voodoo_params_t, dBdX));       
+        addbyte(0x8b); /*MOV EAX, params->dZdX[ESI]*/
+        addbyte(0x86);
+        addlong(offsetof(voodoo_params_t, dZdX));
+        addbyte(0xf3); /*MOVDQU XMM5, params->tmu[0].dSdX[ESI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0xae);
+        addlong(offsetof(voodoo_params_t, tmu[0].dSdX));
+        addbyte(0xf3); /*MOVQ XMM6, params->tmu[0].dWdX[ESI]*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xb6);
+        addlong(offsetof(voodoo_params_t, tmu[0].dWdX));
+
+        if (state->xdir > 0)
+        {
+                addbyte(0x66); /*PADDD XMM1, XMM0*/
+                addbyte(0x0f);
+                addbyte(0xfe);
+                addbyte(0xc8);
+        }
+        else
+        {
+                addbyte(0x66); /*PSUBD XMM1, XMM0*/
+                addbyte(0x0f);
+                addbyte(0xfa);
+                addbyte(0xc8);
+        }
+
+        addbyte(0xf3); /*MOVQ XMM0, state->w*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, w));
+        addbyte(0xf3); /*MOVDQU state->ib, XMM1*/
+        addbyte(0x0f);
+        addbyte(0x7f);
+        addbyte(0x8f);
+        addlong(offsetof(voodoo_state_t, ib));
+        addbyte(0xf3); /*MOVQ XMM7, params->dWdX*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xbe);
+        addlong(offsetof(voodoo_params_t, dWdX));
+
+        if (state->xdir > 0)
+        {
+                addbyte(0x66); /*PADDQ XMM3, XMM5*/
+                addbyte(0x0f);
+                addbyte(0xd4);
+                addbyte(0xdd);
+                addbyte(0x66); /*PADDQ XMM4, XMM6*/
+                addbyte(0x0f);
+                addbyte(0xd4);
+                addbyte(0xe6);
+                addbyte(0x66); /*PADDQ XMM0, XMM7*/
+                addbyte(0x0f);
+                addbyte(0xd4);
+                addbyte(0xc7);
+                addbyte(0x01); /*ADD state->z[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, z));        
+        }
+        else
+        {
+                addbyte(0x66); /*PSUBQ XMM3, XMM5*/
+                addbyte(0x0f);
+                addbyte(0xfb);
+                addbyte(0xdd);
+                addbyte(0x66); /*PSUBQ XMM4, XMM6*/
+                addbyte(0x0f);
+                addbyte(0xfb);
+                addbyte(0xe6);
+                addbyte(0x66); /*PSUBQ XMM0, XMM7*/
+                addbyte(0x0f);
+                addbyte(0xfb);
+                addbyte(0xc7);
+                addbyte(0x29); /*SUB state->z[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, z));        
+        }
+
+        if (voodoo->dual_tmus)
+        {
+                addbyte(0xf3); /*MOVDQU XMM5, params->tmu[1].dSdX[ESI]*/
+                addbyte(0x0f);
+                addbyte(0x6f);
+                addbyte(0xae);
+                addlong(offsetof(voodoo_params_t, tmu[1].dSdX));
+                addbyte(0xf3); /*MOVQ XMM6, params->tmu[1].dWdX[ESI]*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xb6);
+                addlong(offsetof(voodoo_params_t, tmu[1].dWdX));
+        }
+
+        addbyte(0xf3); /*MOVDQU state->tmu0_s, XMM3*/
+        addbyte(0x0f);
+        addbyte(0x7f);
+        addbyte(0x9f);
+        addlong(offsetof(voodoo_state_t, tmu0_s));
+        addbyte(0x66); /*MOVQ state->tmu0_w, XMM4*/
+        addbyte(0x0f);
+        addbyte(0xd6);
+        addbyte(0xa7);
+        addlong(offsetof(voodoo_state_t, tmu0_w));
+        addbyte(0x66); /*MOVQ state->w, XMM0*/
+        addbyte(0x0f);
+        addbyte(0xd6);
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, w));
+
+        if (voodoo->dual_tmus)
+        {
+                addbyte(0xf3); /*MOVDQU XMM3, state->tmu1_s[EDI]*/
+                addbyte(0x0f);
+                addbyte(0x6f);
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, tmu1_s));
+                addbyte(0xf3); /*MOVQ XMM4, state->tmu1_w[EDI]*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xa7);
+                addlong(offsetof(voodoo_state_t, tmu1_w));
+
+                if (state->xdir > 0)
+                {
+                        addbyte(0x66); /*PADDQ XMM3, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xd4);
+                        addbyte(0xdd);
+                        addbyte(0x66); /*PADDQ XMM4, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xd4);
+                        addbyte(0xe6);
+                }
+                else
+                {
+                        addbyte(0x66); /*PSUBQ XMM3, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfb);
+                        addbyte(0xdd);
+                        addbyte(0x66); /*PSUBQ XMM4, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xfb);
+                        addbyte(0xe6);
+                }
+        
+                addbyte(0xf3); /*MOVDQU state->tmu1_s, XMM3*/
+                addbyte(0x0f);
+                addbyte(0x7f);
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, tmu1_s));
+                addbyte(0x66); /*MOVQ state->tmu1_w, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xd6);
+                addbyte(0xa7);
+                addlong(offsetof(voodoo_state_t, tmu1_w));
+        }
+        
+        addbyte(0x83); /*ADD state->pixel_count[EDI], 1*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, pixel_count));
+        addbyte(1);
+
+        if (params->fbzColorPath & FBZCP_TEXTURE_ENABLED)
+        {
+                if ((params->textureMode[0] & TEXTUREMODE_MASK) == TEXTUREMODE_PASSTHROUGH ||
+                    (params->textureMode[0] & TEXTUREMODE_LOCAL_MASK) == TEXTUREMODE_LOCAL)
+                {
+                        addbyte(0x83); /*ADD state->texel_count[EDI], 1*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, texel_count));
+                        addbyte(1);
+                }
+                else
+                {
+                        addbyte(0x83); /*ADD state->texel_count[EDI], 2*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, texel_count));
+                        addbyte(2);
+                }                
+        }
+
+        addbyte(0x8b); /*MOV EAX, state->x[EDI]*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, x));
+        
+        if (state->xdir > 0)
+        {
+                addbyte(0x83); /*ADD state->x[EDI], 1*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, x));
+                addbyte(1);
+        }
+        else
+        {
+                addbyte(0x83); /*SUB state->x[EDI], 1*/
+                addbyte(0xaf);
+                addlong(offsetof(voodoo_state_t, x));
+                addbyte(1);
+        }
+
+        addbyte(0x3b); /*CMP EAX, state->x2[EDI]*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, x2));
+        addbyte(0x0f); /*JNZ loop_jump_pos*/
+        addbyte(0x85);
+        addlong(loop_jump_pos - (block_pos + 4));
+
+        addbyte(0x41); /*POP R15*/
+        addbyte(0x5f);        
+        addbyte(0x41); /*POP R14*/
+        addbyte(0x5e);
+        addbyte(0x41); /*POP R13*/
+        addbyte(0x5d);        
+        addbyte(0x41); /*POP R12*/
+        addbyte(0x5c);
+        addbyte(0x5b); /*POP RBX*/        
+        addbyte(0x5e); /*POP RSI*/
+        addbyte(0x5f); /*POP RDI*/
+        addbyte(0x5d); /*POP RBP*/
+        
+        addbyte(0xC3); /*RET*/
+}
+int voodoo_recomp = 0; /*Counter, incremented by voodoo_get_block() each time it has to generate a new code block*/
+/*Finds the generated code block for the current render state, generating a new
+  one when no cached block matches.
+
+  The cache holds BLOCK_NUM blocks for each odd_even value, interleaved so that
+  block b of a given odd_even lives at index odd_even + b*4 of
+  voodoo->codegen_data. A block matches when every value recorded at generation
+  time (xdir, alphaMode, fbzMode, fogMode, fbzColorPath, bit 18 of trexInit1[0],
+  both textureMode registers, the LOD_MASK bits of both tLOD registers and the
+  tiled flag) equals the current one.
+
+  voodoo   - supplies codegen_data and trexInit1[0]
+  params   - supplies the register values making up most of the key
+  state    - supplies xdir
+  odd_even - selects the set of blocks; also indexes last_block[] and
+             next_block_to_write[]
+
+  Returns a pointer to the matching (or freshly generated) code_block.*/
+static inline void *voodoo_get_block(voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int odd_even)
+{
+        int c;
+        int b = last_block[odd_even];
+        const int is_tiled = (params->col_tiled || params->aux_tiled) ? 1 : 0;
+        voodoo_x86_data_t *voodoo_x86_data = voodoo->codegen_data;
+        voodoo_x86_data_t *data;
+
+        /*Probe every block once, starting with the one that matched last time.
+          The block is addressed with b (not the loop counter) so that the value
+          saved in last_block[] really is the index of the block that matched.
+          Wrapping with BLOCK_NUM-1 relies on BLOCK_NUM being a power of two.*/
+        for (c = 0; c < BLOCK_NUM; c++)
+        {
+                data = &voodoo_x86_data[odd_even + b*4];
+
+                if (state->xdir == data->xdir &&
+                    params->alphaMode == data->alphaMode &&
+                    params->fbzMode == data->fbzMode &&
+                    params->fogMode == data->fogMode &&
+                    params->fbzColorPath == data->fbzColorPath &&
+                    (voodoo->trexInit1[0] & (1 << 18)) == data->trexInit1 &&
+                    params->textureMode[0] == data->textureMode[0] &&
+                    params->textureMode[1] == data->textureMode[1] &&
+                    (params->tLOD[0] & LOD_MASK) == data->tLOD[0] &&
+                    (params->tLOD[1] & LOD_MASK) == data->tLOD[1] &&
+                    is_tiled == data->is_tiled)
+                {
+                        last_block[odd_even] = b;
+                        return data->code_block;
+                }
+
+                b = (b + 1) & (BLOCK_NUM-1);
+        }
+
+        /*No match: generate into the next block in round-robin order,
+          overwriting whatever it held*/
+        voodoo_recomp++;
+        data = &voodoo_x86_data[odd_even + next_block_to_write[odd_even]*4];
+
+        voodoo_generate(data->code_block, voodoo, params, state, depth_op);
+
+        /*Record the values the block was generated for; these form the key
+          compared in the loop above*/
+        data->xdir = state->xdir;
+        data->alphaMode = params->alphaMode;
+        data->fbzMode = params->fbzMode;
+        data->fogMode = params->fogMode;
+        data->fbzColorPath = params->fbzColorPath;
+        data->trexInit1 = voodoo->trexInit1[0] & (1 << 18);
+        data->textureMode[0] = params->textureMode[0];
+        data->textureMode[1] = params->textureMode[1];
+        data->tLOD[0] = params->tLOD[0] & LOD_MASK;
+        data->tLOD[1] = params->tLOD[1] & LOD_MASK;
+        data->is_tiled = is_tiled;
+
+        next_block_to_write[odd_even] = (next_block_to_write[odd_even] + 1) & (BLOCK_NUM-1);
+
+        return data->code_block;
+}
+
+/*Allocates the executable buffer that holds the generated code blocks and fills
+  in the lookup tables alookup, aminuslookup, bilinear_lookup and xmm_00_ff_w.
+
+  voodoo - device whose codegen_data pointer receives the buffer; the buffer has
+           room for BLOCK_NUM * 4 voodoo_x86_data_t entries and is mapped
+           readable, writable and executable.
+
+  Calls fatal() if the buffer cannot be allocated.*/
+void voodoo_codegen_init(voodoo_t *voodoo)
+{
+        int c;
+
+#if WIN64
+        voodoo->codegen_data = VirtualAlloc(NULL, sizeof(voodoo_x86_data_t) * BLOCK_NUM * 4, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+        if (!voodoo->codegen_data)
+                fatal("voodoo_codegen_init : VirtualAlloc failed\n");
+#else
+        /*Anonymous mappings have no backing file; portable code passes fd -1*/
+        voodoo->codegen_data = mmap(0, sizeof(voodoo_x86_data_t) * BLOCK_NUM*4, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0);
+        /*mmap reports failure with MAP_FAILED, not NULL*/
+        if (voodoo->codegen_data == MAP_FAILED)
+                fatal("voodoo_codegen_init : mmap failed\n");
+#endif
+
+        for (c = 0; c < 256; c++)
+        {
+                int d[4];
+                int _ds = c & 0xf; /*Low nibble of the table index*/
+                int dt = c >> 4;   /*High nibble of the table index*/
+
+                /*c and 255-c replicated into the four 16-bit words of the low
+                  64 bits; the high 64 bits are zero*/
+                alookup[c] = _mm_set_epi32(0, 0, c | (c << 16), c | (c << 16));
+                aminuslookup[c] = _mm_set_epi32(0, 0, (255-c) | ((255-c) << 16), (255-c) | ((255-c) << 16));
+
+                /*Four weights built from the two nibbles; they always sum to 256*/
+                d[0] = (16 - _ds) * (16 - dt);
+                d[1] =  _ds * (16 - dt);
+                d[2] = (16 - _ds) * dt;
+                d[3] = _ds * dt;
+
+                /*Two vectors per index: d[0] (low half) and d[1] (high half),
+                  then d[2] and d[3], each replicated into four 16-bit words*/
+                bilinear_lookup[c*2]     = _mm_set_epi32(d[1] | (d[1] << 16), d[1] | (d[1] << 16), d[0] | (d[0] << 16), d[0] | (d[0] << 16));
+                bilinear_lookup[c*2 + 1] = _mm_set_epi32(d[3] | (d[3] << 16), d[3] | (d[3] << 16), d[2] | (d[2] << 16), d[2] | (d[2] << 16));
+        }
+        /*One extra alookup entry holding the value 256*/
+        alookup[256] = _mm_set_epi32(0, 0, 256 | (256 << 16), 256 | (256 << 16));
+        xmm_00_ff_w[0] = _mm_set_epi32(0, 0, 0, 0);
+        xmm_00_ff_w[1] = _mm_set_epi32(0, 0, 0xff | (0xff << 16), 0xff | (0xff << 16));
+}
+
+/*Releases the code block buffer referenced by voodoo->codegen_data*/
+void voodoo_codegen_close(voodoo_t *voodoo)
+{
+#if WIN64
+        /*With MEM_RELEASE the size argument is 0 and the whole region is freed*/
+        VirtualFree(voodoo->codegen_data, 0, MEM_RELEASE);
+#else
+        /*Same length as was passed to mmap: BLOCK_NUM*4 entries*/
+        munmap(voodoo->codegen_data, sizeof(voodoo_x86_data_t) * (BLOCK_NUM*4));
+#endif
+}
+
diff --git a/pcem/vid_voodoo_codegen_x86.h b/pcem/vid_voodoo_codegen_x86.h
new file mode 100644 (file)
index 0000000..c925d5b
--- /dev/null
@@ -0,0 +1,3412 @@
+/*Registers :
+        
+  alphaMode
+  fbzMode & 0x1f3fff
+  fbzColorPath
+*/
+
+#if defined(__linux__) || defined(__APPLE__)
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+#if defined WIN32 || defined _WIN32 || defined _WIN32
+#define BITMAP windows_BITMAP
+#include <windows.h>
+#undef BITMAP
+#endif
+
+#include <xmmintrin.h>
+
+/*NOTE(review): presumably the number of cached code blocks per group - confirm
+  against the allocation and lookup code*/
+#define BLOCK_NUM 8
+/*Mask for wrapping a block index; valid because BLOCK_NUM is a power of two*/
+#define BLOCK_MASK (BLOCK_NUM-1)
+/*Size in bytes of voodoo_x86_data_t.code_block; the add*() emitters call
+  fatal() when this is reached*/
+#define BLOCK_SIZE 8192
+
+/*The tLOD bits (S and T mirroring) that the code generator tests*/
+#define LOD_MASK (LOD_TMIRROR_S | LOD_TMIRROR_T)
+
+/*One block of generated machine code plus a set of register values.
+  NOTE(review): presumably the register fields record the state the code was
+  generated for, so that a matching block can be reused - confirm against the
+  code that fills in and compares this structure.*/
+typedef struct voodoo_x86_data_t
+{
+        uint8_t code_block[BLOCK_SIZE]; /*Generated machine code*/
+        int xdir;
+        uint32_t alphaMode;
+        uint32_t fbzMode;
+        uint32_t fogMode;
+        uint32_t fbzColorPath;
+        uint32_t textureMode[2];        /*One entry per TMU*/
+        uint32_t tLOD[2];               /*One entry per TMU*/
+        uint32_t trexInit1;        
+        int is_tiled;
+} voodoo_x86_data_t;
+
+/*Block bookkeeping, four entries each. Only the first two entries have explicit
+  initialisers; the remaining ones are zero-initialised as well.
+  NOTE(review): presumably indexed by the render thread / odd-even selector -
+  confirm against the block lookup code.*/
+static int last_block[4] = {0, 0};
+static int next_block_to_write[4] = {0, 0};
+
+#define addbyte(val)                                            \
+        do {                                                    \
+                code_block[block_pos++] = val;                  \
+                if (block_pos >= BLOCK_SIZE)                    \
+                        fatal("Over!\n");                       \
+        } while (0)
+
+#define addword(val)                                            \
+        do {                                                    \
+                *(uint16_t *)&code_block[block_pos] = val;      \
+                block_pos += 2;                                 \
+                if (block_pos >= BLOCK_SIZE)                    \
+                        fatal("Over!\n");                       \
+        } while (0)
+
+#define addlong(val)                                            \
+        do {                                                    \
+                *(uint32_t *)&code_block[block_pos] = val;      \
+                block_pos += 4;                                 \
+                if (block_pos >= BLOCK_SIZE)                    \
+                        fatal("Over!\n");                       \
+        } while (0)
+
+#define addquad(val)                                            \
+        do {                                                    \
+                *(uint64_t *)&code_block[block_pos] = val;      \
+                block_pos += 8;                                 \
+                if (block_pos >= BLOCK_SIZE)                    \
+                        fatal("Over!\n");                       \
+        } while (0)
+
+
+/*Vector constants. They are assigned at the start of voodoo_generate(); the
+  trailing comments give the value held in the low 64 bits.*/
+static __m128i xmm_01_w;// = 0x0001000100010001ull;
+static __m128i xmm_ff_w;// = 0x00ff00ff00ff00ffull;
+static __m128i xmm_ff_b;// = 0x0000000000ffffffull; (voodoo_generate() stores 0x00ffffff in the lowest dword only)
+
+/*Referenced by absolute address from the generated code (source operand of the
+  CMOVS used for clamping to zero)*/
+static uint32_t zero = 0;
+/*Referenced by absolute address from the generated code (dividend loaded with
+  FLD in the perspective path). Despite the name it is initialised to 2^4.*/
+static double const_1_48 = (double)(1ull << 4);
+
+/*NOTE(review): presumably filled in by this file's voodoo_codegen_init() -
+  confirm. alookup has one more entry than aminuslookup.*/
+static __m128i alookup[257], aminuslookup[256];
+static __m128i minus_254;// = 0xff02ff02ff02ff02ull;
+/*Two vectors (32 bytes) per entry; the generated bilinear code adds
+  (index << 5) to the address of this table*/
+static __m128i bilinear_lookup[256*2];
+static __m128i xmm_00_ff_w[2];
+static uint32_t i_00_ff_w[2] = {0, 0xff};
+
+/*Emits 32-bit x86 machine code that computes the texture coordinates and LOD
+  for one TMU and, if texturing is enabled, fetches the texel.
+
+  code_block - buffer the machine code is appended to (via the add*() macros,
+               which call fatal() when BLOCK_SIZE is reached)
+  voodoo     - device instance; only bilinear_enabled is read here
+  params     - parameters read at generation time: textureMode[tmu],
+               fbzColorPath and tLOD[tmu] select which code is emitted
+  state      - state read at generation time: clamp_s[tmu] and clamp_t[tmu]
+               select clamping or wrapping code
+  block_pos  - offset in code_block at which to start emitting
+  tmu        - texture unit index; selects the tmu0_* or tmu1_* state fields
+
+  Returns the offset in code_block following the emitted code.
+
+  Conventions of the emitted code, as far as they can be read from the
+  encodings below: EDI holds the voodoo_state_t pointer and ESI the
+  voodoo_params_t pointer on entry. EAX, EBX, ECX, EDX and EBP are used as
+  scratch, as are XMM0/XMM1 (bilinear path) and XMM4/XMM5 (non-perspective
+  path). The bilinear path overwrites ESI and reloads it from [ESP+24] before
+  finishing. When texturing is enabled the fetched texel is left in EAX.
+  NOTE(review): the bilinear path unpacks bytes against XMM2, so XMM2 is
+  presumably expected to be zero on entry - confirm against the caller.
+
+  Several short jumps are encoded as sums of the byte lengths of the
+  instructions that follow them, so instructions must not be added, removed or
+  re-encoded between a jump and its target without updating those sums.*/
+static inline int codegen_texture_fetch(uint8_t *code_block, voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int block_pos, int tmu)
+{
+        /*textureMode bit 0 selects the path that divides S and T by W*/
+        if (params->textureMode[tmu] & 1)
+        {
+                addbyte(0xdf); /*FILDq state->tmu0_w*/
+                addbyte(0xaf);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_w) : offsetof(voodoo_state_t, tmu0_w));
+                addbyte(0xdd); /*FLDq const_1_48*/
+                addbyte(0x05);
+                addlong((uint32_t)&const_1_48);
+                addbyte(0xde); /*FDIV ST(1)*/
+                addbyte(0xf1);
+                addbyte(0xdf); /*FILDq state->tmu0_s*/
+                addbyte(0xaf);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_s) : offsetof(voodoo_state_t, tmu0_s));
+                addbyte(0xdf); /*FILDq state->tmu0_t*/ /*ST(0)=t,   ST(1)=s,   ST(2)=1/w*/
+                addbyte(0xaf);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_t) : offsetof(voodoo_state_t, tmu0_t));
+                addbyte(0xd9); /*FXCH ST(1)*/          /*ST(0)=s,   ST(1)=t,   ST(2)=1/w*/
+                addbyte(0xc9);
+                addbyte(0xd8); /*FMUL ST(2)*/          /*ST(0)=s/w, ST(1)=t,   ST(2)=1/w*/
+                addbyte(0xca);
+                addbyte(0xd9); /*FXCH ST(1)*/          /*ST(0)=t,   ST(1)=s/w, ST(2)=1/w*/
+                addbyte(0xc9);
+                addbyte(0xd8); /*FMUL ST(2)*/          /*ST(0)=t/w, ST(1)=s/w, ST(2)=1/w*/
+                addbyte(0xca);
+                addbyte(0xd9); /*FXCH ST(2)*/          /*ST(0)=1/w, ST(1)=s/w, ST(2)=t/w*/
+                addbyte(0xca);
+                addbyte(0xd9); /*FSTPs log_temp*/      /*ST(0)=s/w, ST(1)=t/w*/
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, log_temp));
+                addbyte(0xdf); /*FISTPq state->tex_s*/
+                addbyte(0xbf);
+                addlong(offsetof(voodoo_state_t, tex_s));
+                addbyte(0x8b); /*MOV EAX, log_temp*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, log_temp));
+                addbyte(0xdf); /*FISTPq state->tex_t*/
+                addbyte(0xbf);
+                addlong(offsetof(voodoo_state_t, tex_t));
+                /*Derive the LOD from the single-precision bit pattern of 1/w :
+                  exponent goes to bits 8-15, top 8 mantissa bits go through logtable*/
+                addbyte(0xc1); /*SHR EAX, 23-8*/
+                addbyte(0xe8);
+                addbyte(15);
+                addbyte(0x0f); /*MOVZX EBX, AL*/
+                addbyte(0xb6);
+                addbyte(0xd8);
+                addbyte(0x25); /*AND EAX, 0xff00*/
+                addlong(0xff00);
+                addbyte(0x2d); /*SUB EAX, (127-44+19)<<8*/
+                addlong((127-44+19) << 8);
+                addbyte(0x0f); /*MOVZX EBX, logtable[EBX]*/
+                addbyte(0xb6);
+                addbyte(0x9b);
+                addlong((uint32_t)logtable);
+                addbyte(0x09); /*OR EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x03); /*ADD EAX, state->tmu[tmu].lod*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tmu[tmu].lod));
+                addbyte(0x3b); /*CMP EAX, state->lod_min*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_min[tmu]));
+                addbyte(0x0f); /*CMOVL EAX, state->lod_min*/
+                addbyte(0x4c);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_min[tmu]));
+                addbyte(0x3b); /*CMP EAX, state->lod_max*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_max[tmu]));
+                addbyte(0x0f); /*CMOVNL EAX, state->lod_max*/
+                addbyte(0x4d);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_max[tmu]));
+                addbyte(0x0f); /*MOVZX EBX, AL*/
+                addbyte(0xb6);
+                addbyte(0xd8);
+                addbyte(0xc1); /*SHR EAX, 8*/
+                addbyte(0xe8);
+                addbyte(8);
+                addbyte(0x89); /*MOV state->lod_frac[tmu], EBX*/
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, lod_frac[tmu]));
+                addbyte(0x89); /*MOV state->lod, EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod));
+        }
+        else
+        {
+                /*No division by W : S and T are shifted right by 28 and lod_min is used as the LOD*/
+                addbyte(0xf3); /*MOVQ XMM4, state->tmu0_s*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xa7);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_s) : offsetof(voodoo_state_t, tmu0_s));
+                addbyte(0xf3); /*MOVQ XMM5, state->tmu0_t*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xaf);
+                addlong(tmu ? offsetof(voodoo_state_t, tmu1_t) : offsetof(voodoo_state_t, tmu0_t));
+                addbyte(0xc7); /*MOV state->lod_frac[tmu], 0*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_frac[tmu]));
+                addlong(0);
+                addbyte(0x8b); /*MOV EAX, state->lod_min*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod_min[tmu]));
+                addbyte(0x66); /*PSRLQ XMM4, 28*/
+                addbyte(0x0f);
+                addbyte(0x73);
+                addbyte(0xd4);
+                addbyte(28);
+                addbyte(0x66); /*PSRLQ XMM5, 28*/
+                addbyte(0x0f);
+                addbyte(0x73);
+                addbyte(0xd5);
+                addbyte(28);
+                addbyte(0x0f); /*MOVZX EBX, AL*/
+                addbyte(0xb6);
+                addbyte(0xd8);
+                addbyte(0xc1); /*SHR EAX, 8*/
+                addbyte(0xe8);
+                addbyte(8);        
+                addbyte(0x66); /*MOVQ state->tex_s, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xd6);
+                addbyte(0xa7);
+                addlong(offsetof(voodoo_state_t, tex_s));
+                addbyte(0x66); /*MOVQ state->tex_t, XMM5*/
+                addbyte(0x0f);
+                addbyte(0xd6);
+                addbyte(0xaf);
+                addlong(offsetof(voodoo_state_t, tex_t));
+                addbyte(0x89); /*MOV state->lod_frac[tmu], EBX*/
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, lod_frac[tmu]));
+                addbyte(0x89); /*MOV state->lod, EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, lod));
+        }
+        /*EAX = state->lod*/
+        if (params->fbzColorPath & FBZCP_TEXTURE_ENABLED)
+        {
+                /*Bilinear filtered fetch when enabled globally and requested by textureMode bits 1-2*/
+                if (voodoo->bilinear_enabled && (params->textureMode[tmu] & 6))
+                {
+                        addbyte(0x8b); /*MOV ECX, state->tex_lod[tmu]*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, tex_lod[tmu]));
+                        addbyte(0xb2); /*MOV DL, 8*/
+                        addbyte(8);
+                        addbyte(0x8b); /*MOV ECX, [ECX+EAX*4]*/
+                        addbyte(0x0c);
+                        addbyte(0x81);
+                        addbyte(0xbd); /*MOV EBP, 8*/
+                        addlong(8);
+                        addbyte(0x28); /*SUB DL, CL*/
+                        addbyte(0xca);
+                        addbyte(0xd3); /*SHL EBP, CL*/
+                        addbyte(0xe5);
+                        addbyte(0x8b); /*MOV EAX, state->tex_s[EDI]*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_s));
+                        addbyte(0x8b); /*MOV EBX, state->tex_t[EDI]*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_t));
+                        if (params->tLOD[tmu] & LOD_TMIRROR_S)
+                        {
+                                addbyte(0xa9); /*TEST EAX, 0x1000*/
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EAX*/
+                                addbyte(0xd0);
+                        }
+                        if (params->tLOD[tmu] & LOD_TMIRROR_T)
+                        {
+                                addbyte(0xf7); /*TEST EBX, 0x1000*/
+                                addbyte(0xc3);
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EBX*/
+                                addbyte(0xd3);
+                        }
+                        addbyte(0x29); /*SUB EAX, EBP*/
+                        addbyte(0xe8);
+                        addbyte(0x29); /*SUB EBX, EBP*/
+                        addbyte(0xeb);
+                        addbyte(0xd3); /*SAR EAX, CL*/
+                        addbyte(0xf8);
+                        addbyte(0xd3); /*SAR EBX, CL*/
+                        addbyte(0xfb);
+                        /*Low 4 bits of S and T form the 8-bit index into bilinear_lookup*/
+                        addbyte(0x89); /*MOV EBP, EAX*/
+                        addbyte(0xc5);
+                        addbyte(0x89); /*MOV ECX, EBX*/
+                        addbyte(0xd9);
+                        addbyte(0x83); /*AND EBP, 0xf*/
+                        addbyte(0xe5);
+                        addbyte(0xf);
+                        addbyte(0xc1); /*SHL ECX, 4*/
+                        addbyte(0xe1);
+                        addbyte(4);
+                        addbyte(0xc1); /*SAR EAX, 4*/
+                        addbyte(0xf8);
+                        addbyte(4);
+                        addbyte(0x81); /*AND ECX, 0xf0*/
+                        addbyte(0xe1);
+                        addlong(0xf0);
+                        addbyte(0xc1); /*SAR EBX, 4*/
+                        addbyte(0xfb);
+                        addbyte(4);
+                        addbyte(0x09); /*OR EBP, ECX*/
+                        addbyte(0xcd);
+                        addbyte(0x8b); /*MOV ECX, state->lod[EDI]*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xc1); /*SHL EBP, 5*/
+                        addbyte(0xe5);
+                        addbyte(5);
+                        /*EAX = S, EBX = T, ECX = LOD, EDX = tex_shift, ESI=params, EDI=state, EBP = bilinear shift*/
+                        addbyte(0x8d); /*LEA ESI, [ESI+ECX*4]*/
+                        addbyte(0x34);
+                        addbyte(0x8e);
+                        addbyte(0x89); /*MOV ebp_store, EBP*/
+                        addbyte(0xaf);
+                        addlong(offsetof(voodoo_state_t, ebp_store));
+                        addbyte(0x8b); /*MOV EBP, state->tex[EDI+ECX*4]*/
+                        addbyte(0xac);
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, tex[tmu]));
+                        addbyte(0x88); /*MOV CL, DL*/
+                        addbyte(0xd1);
+                        addbyte(0x89); /*MOV EDX, EBX*/
+                        addbyte(0xda);
+                        if (!state->clamp_s[tmu])
+                        {
+                                addbyte(0x23); /*AND EAX, params->tex_w_mask[ESI]*/
+                                addbyte(0x86);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]));
+                        }
+                        addbyte(0x83); /*ADD EDX, 1*/
+                        addbyte(0xc2);
+                        addbyte(1);
+                        if (state->clamp_t[tmu])
+                        {
+                                addbyte(0x0f); /*CMOVS EDX, zero*/
+                                addbyte(0x48);
+                                addbyte(0x15);
+                                addlong((uint32_t)&zero);
+                                addbyte(0x3b); /*CMP EDX, params->tex_h_mask[ESI]*/
+                                addbyte(0x96);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x0f); /*CMOVA EDX, params->tex_h_mask[ESI]*/
+                                addbyte(0x47);
+                                addbyte(0x96);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x85); /*TEST EBX,EBX*/
+                                addbyte(0xdb);
+                                addbyte(0x0f); /*CMOVS EBX, zero*/
+                                addbyte(0x48);
+                                addbyte(0x1d);
+                                addlong((uint32_t)&zero);
+                                addbyte(0x3b); /*CMP EBX, params->tex_h_mask[ESI]*/
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x0f); /*CMOVA EBX, params->tex_h_mask[ESI]*/
+                                addbyte(0x47);
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                        }
+                        else
+                        {
+                                addbyte(0x23); /*AND EDX, params->tex_h_mask[ESI]*/
+                                addbyte(0x96);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                                addbyte(0x23); /*AND EBX, params->tex_h_mask[ESI]*/
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]));
+                        }
+                        /*EAX = S, EBX = T0, EDX = T1*/
+                        addbyte(0xd3); /*SHL EBX, CL*/
+                        addbyte(0xe3);
+                        addbyte(0xd3); /*SHL EDX, CL*/
+                        addbyte(0xe2);
+                        addbyte(0x8d); /*LEA EBX,[EBP+EBX*4]*/
+                        addbyte(0x5c);
+                        addbyte(0x9d);
+                        addbyte(0);
+                        addbyte(0x8d); /*LEA EDX,[EBP+EDX*4]*/
+                        addbyte(0x54);
+                        addbyte(0x95);
+                        addbyte(0);
+                        if (state->clamp_s[tmu])
+                        {
+                                addbyte(0x8b); /*MOV EBP, params->tex_w_mask[ESI]*/
+                                addbyte(0xae);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]));
+                                addbyte(0x85); /*TEST EAX, EAX*/
+                                addbyte(0xc0);
+                                addbyte(0x8b); /*MOV ESI, ebp_store*/
+                                addbyte(0xb7);
+                                addlong(offsetof(voodoo_state_t, ebp_store));
+                                addbyte(0x0f); /*CMOVS EAX, zero*/
+                                addbyte(0x48);
+                                addbyte(0x05);
+                                addlong((uint32_t)&zero);
+                                /*Jump distances below are the summed lengths of the instructions skipped*/
+                                addbyte(0x78); /*JS + - clamp on 0*/
+                                addbyte(2+3+2+ 5+5+2);
+                                addbyte(0x3b); /*CMP EAX, EBP*/
+                                addbyte(0xc5);
+                                addbyte(0x0f); /*CMOVAE EAX, EBP*/
+                                addbyte(0x43);
+                                addbyte(0xc5);
+                                addbyte(0x73); /*JAE + - clamp on +*/
+                                addbyte(5+5+2);
+                        }
+                        else
+                        {
+                                addbyte(0x3b); /*CMP EAX, params->tex_w_mask[ESI] - is S at texture edge (ie will wrap/clamp)?*/
+                                addbyte(0x86);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]));
+                                addbyte(0x8b); /*MOV ESI, ebp_store*/
+                                addbyte(0xb7);
+                                addlong(offsetof(voodoo_state_t, ebp_store));
+                                addbyte(0x74); /*JE +*/
+                                addbyte(5+5+2);
+                        }
+
+                        /*Common case : both horizontal texels are adjacent in memory, so load each row pair with one 64-bit read*/
+                        addbyte(0xf3); /*MOVQ XMM0, [EBX+EAX*4]*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0x04);
+                        addbyte(0x83);
+                        addbyte(0xf3); /*MOVQ XMM1, [EDX+EAX*4]*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0x0c);
+                        addbyte(0x82);
+                                               
+                        if (state->clamp_s[tmu])
+                        {
+                                addbyte(0xeb); /*JMP +*/
+                                addbyte(5+5+4+4);
+
+                                /*S clamped - the two S coordinates are the same*/
+                                addbyte(0x66); /*MOVD XMM0, [EBX+EAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x04);
+                                addbyte(0x83);
+                                addbyte(0x66); /*MOVD XMM1, [EDX+EAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x0c);
+                                addbyte(0x82);
+                                addbyte(0x66); /*PUNPCKLDQ XMM0, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0x62);
+                                addbyte(0xc0);
+                                addbyte(0x66); /*PUNPCKLDQ XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x62);
+                                addbyte(0xc9);
+                        }
+                        else
+                        {
+                                addbyte(0xeb); /*JMP +*/
+                                addbyte(5+5+5+5+6+6);
+
+                                /*S wrapped - the two S coordinates are not contiguous*/
+                                addbyte(0x66); /*MOVD XMM0, [EBX+EAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x04);
+                                addbyte(0x83);
+                                addbyte(0x66); /*MOVD XMM1, [EDX+EAX*4]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x0c);
+                                addbyte(0x82);
+                                addbyte(0x66); /*PINSRW XMM0, [EBX], 2*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x03);
+                                addbyte(0x02);
+                                addbyte(0x66); /*PINSRW XMM1, [EDX], 2*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x0a);
+                                addbyte(0x02);
+                                addbyte(0x66); /*PINSRW XMM0, 2[EBX], 3*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x43);
+                                addbyte(0x02);
+                                addbyte(0x03);
+                                addbyte(0x66); /*PINSRW XMM1, 2[EDX], 3*/
+                                addbyte(0x0f);
+                                addbyte(0xc4);
+                                addbyte(0x4a);
+                                addbyte(0x02);
+                                addbyte(0x03);
+                        }
+
+                        /*Widen the texel bytes to words, weight the four texels and sum them*/
+                        addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xc2);
+                        addbyte(0x66); /*PUNPCKLBW XMM1, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xca);
+                        
+                        addbyte(0x81); /*ADD ESI, bilinear_lookup*/
+                        addbyte(0xc6);
+                        addlong((uint32_t)bilinear_lookup);
+
+                        addbyte(0x66); /*PMULLW XMM0, bilinear_lookup[ESI]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x06);
+                        addbyte(0x66); /*PMULLW XMM1, bilinear_lookup[ESI]+0x10*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x4e);
+                        addbyte(0x10);
+                        addbyte(0x66); /*PADDW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc0 | 1 | (0 << 3));
+                        addbyte(0x66); /*MOVDQA XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0xc0 | 0 | (1 << 3));
+                        addbyte(0x66); /*PSRLDQ XMM0, 8 (bytes, ie 64 bits)*/
+                        addbyte(0x0f);
+                        addbyte(0x73);
+                        addbyte(0xd8);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc0 | 1 | (0 << 3));
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0 | 0);
+                        addbyte(8);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                        
+                        /*ESI was overwritten above; reload it from the stack.
+                          The displacement is 8 plus the 16 bytes of registers pushed by voodoo_generate()*/
+                        addbyte(0x8b); /*MOV ESI, [ESP+8]*/
+                        addbyte(0x74);
+                        addbyte(0x24);
+                        addbyte(8+16); /*CHECK!*/
+
+                        addbyte(0x66); /*MOVD EAX, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xc0);                        
+                }
+                else
+                {
+                        /*Point sampled fetch*/
+                        addbyte(0x8b); /*MOV ECX, state->tex_lod[tmu]*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, tex_lod[tmu]));
+                        addbyte(0xb2); /*MOV DL, 8*/
+                        addbyte(8);
+                        addbyte(0x8b); /*MOV ECX, [ECX+EAX*4]*/
+                        addbyte(0x0c);
+                        addbyte(0x81);
+                        addbyte(0x8b); /*MOV EBP, state->tex[EDI+ECX*4]*/
+                        addbyte(0xac);
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, tex[tmu]));
+                        addbyte(0x28); /*SUB DL, CL*/
+                        addbyte(0xca);
+                        /*ECX is biased by 4 from here on; the -0x10 in the [ESI+ECX*4] displacements below compensates*/
+                        addbyte(0x80); /*ADD CL, 4*/
+                        addbyte(0xc1);
+                        addbyte(4);
+                        addbyte(0x8b); /*MOV EAX, state->tex_s[EDI]*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_s));
+                        addbyte(0x8b); /*MOV EBX, state->tex_t[EDI]*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_t));
+                        if (params->tLOD[tmu] & LOD_TMIRROR_S)
+                        {
+                                addbyte(0xa9); /*TEST EAX, 0x1000*/
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EAX*/
+                                addbyte(0xd0);
+                        }
+                        if (params->tLOD[tmu] & LOD_TMIRROR_T)
+                        {
+                                addbyte(0xf7); /*TEST EBX, 0x1000*/
+                                addbyte(0xc3);
+                                addlong(0x1000);
+                                addbyte(0x74); /*JZ +*/
+                                addbyte(2);
+                                addbyte(0xf7); /*NOT EBX*/
+                                addbyte(0xd3);
+                        }
+                        addbyte(0xd3); /*SHR EAX, CL*/
+                        addbyte(0xe8);
+                        addbyte(0xd3); /*SHR EBX, CL*/
+                        addbyte(0xeb);
+                        if (state->clamp_s[tmu])
+                        {
+                                addbyte(0x85); /*TEST EAX, EAX*/
+                                addbyte(0xc0);
+                                addbyte(0x0f); /*CMOVS EAX, zero*/
+                                addbyte(0x48);
+                                addbyte(0x05);
+                                addlong((uint32_t)&zero);
+                                addbyte(0x3b); /*CMP EAX, params->tex_w_mask[ESI+ECX*4]*/
+                                addbyte(0x84);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]) - 0x10);
+                                addbyte(0x0f); /*CMOVAE EAX, params->tex_w_mask[ESI+ECX*4]*/
+                                addbyte(0x43);
+                                addbyte(0x84);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]) - 0x10);
+
+                        }
+                        else
+                        {
+                                addbyte(0x23); /*AND EAX, params->tex_w_mask-0x10[ESI+ECX*4]*/
+                                addbyte(0x84);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_w_mask[tmu]) - 0x10);
+                        }
+                        if (state->clamp_t[tmu])
+                        {
+                                addbyte(0x85); /*TEST EBX, EBX*/
+                                addbyte(0xdb);
+                                addbyte(0x0f); /*CMOVS EBX, zero*/
+                                addbyte(0x48);
+                                addbyte(0x1d);
+                                addlong((uint32_t)&zero);
+                                addbyte(0x3b); /*CMP EBX, params->tex_h_mask[ESI+ECX*4]*/
+                                addbyte(0x9c);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]) - 0x10);
+                                addbyte(0x0f); /*CMOVAE EBX, params->tex_h_mask[ESI+ECX*4]*/
+                                addbyte(0x43);
+                                addbyte(0x9c);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]) - 0x10);
+                        }
+                        else
+                        {
+                                addbyte(0x23); /*AND EBX, params->tex_h_mask-0x10[ESI+ECX*4]*/
+                                addbyte(0x9c);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, tex_h_mask[tmu]) - 0x10);
+                        }
+                        addbyte(0x88); /*MOV CL, DL*/
+                        addbyte(0xd1);
+                        addbyte(0xd3); /*SHL EBX, CL*/
+                        addbyte(0xe3);
+                        addbyte(0x01); /*ADD EBX, EAX*/
+                        addbyte(0xc3);
+
+                        addbyte(0x8b); /*MOV EAX,[EBP+EBX*4]*/
+                        addbyte(0x44);
+                        addbyte(0x9d);
+                        addbyte(0);
+                }
+        }
+
+        return block_pos;
+}
+
+static inline void voodoo_generate(uint8_t *code_block, voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int depthop)
+{        
+        int block_pos = 0;
+        int z_skip_pos = 0;
+        int a_skip_pos = 0;
+        int chroma_skip_pos = 0;
+        int depth_jump_pos = 0;
+        int depth_jump_pos2 = 0;
+        int loop_jump_pos = 0;
+//        xmm_01_w = (__m128i)0x0001000100010001ull;
+//        xmm_ff_w = (__m128i)0x00ff00ff00ff00ffull;
+//        xmm_ff_b = (__m128i)0x00000000ffffffffull;
+        xmm_01_w = _mm_set_epi32(0, 0, 0x00010001, 0x00010001);
+        xmm_ff_w = _mm_set_epi32(0, 0, 0x00ff00ff, 0x00ff00ff);
+        xmm_ff_b = _mm_set_epi32(0, 0, 0, 0x00ffffff);
+        minus_254 = _mm_set_epi32(0, 0, 0xff02ff02, 0xff02ff02);
+//        *(uint64_t *)&const_1_48 = 0x45b0000000000000ull;
+//        block_pos = 0;
+//        voodoo_get_depth = &code_block[block_pos];
+        /*W at (%esp+4)
+          Z at (%esp+12)
+          new_depth at (%esp+16)*/
+//        if ((params->fbzMode & FBZ_DEPTH_ENABLE) && (depth_op == DEPTHOP_NEVER))
+//        {
+//                addbyte(0xC3); /*RET*/
+//                return;
+//        }
+        addbyte(0x55); /*PUSH EBP*/
+        addbyte(0x57); /*PUSH EDI*/
+        addbyte(0x56); /*PUSH ESI*/
+        addbyte(0x53); /*PUSH EBX*/
+        
+        addbyte(0x8b); /*MOV EDI, [ESP+4]*/
+        addbyte(0x7c);
+        addbyte(0x24);
+        addbyte(4+16);
+        loop_jump_pos = block_pos;
+        addbyte(0x8b); /*MOV ESI, [ESP+8]*/
+        addbyte(0x74);
+        addbyte(0x24);
+        addbyte(8+16);
+        if (params->col_tiled || params->aux_tiled)
+        {
+                addbyte(0x8b); /*MOV EAX, state->x[EDI]*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x89); /*MOV EBX, EAX*/
+                addbyte(0xc3);
+                addbyte(0x83); /*AND EAX, 63*/
+                addbyte(0xe0);
+                addbyte(63);
+                addbyte(0xc1); /*SHR EBX, 6*/
+                addbyte(0xeb);
+                addbyte(6);
+                addbyte(0xc1); /*SHL EBX, 11  - tile is 128*32, << 12, div 2 because word index*/
+                addbyte(0xe3);
+                addbyte(11);
+                addbyte(0x01); /*ADD EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x89); /*MOV state->x_tiled[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, x_tiled));
+        }
+        addbyte(0x66); /*PXOR XMM2, XMM2*/
+        addbyte(0x0f);
+        addbyte(0xef);
+        addbyte(0xd2);
+
+        if ((params->fbzMode & FBZ_W_BUFFER) || (params->fogMode & (FOG_ENABLE|FOG_CONSTANT|FOG_Z|FOG_ALPHA)) == FOG_ENABLE)
+        {
+                addbyte(0xb8); /*MOV new_depth, 0*/
+                addlong(0);
+                addbyte(0x66); /*TEST w+4, 0xffff*/
+                addbyte(0xf7);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, w)+4);
+                addword(0xffff);
+                addbyte(0x75); /*JNZ got_depth*/
+                depth_jump_pos = block_pos;
+                addbyte(0);
+//                addbyte(4+5+2+3+2+5+5+3+2+2+2+/*3+*/3+2+6+4+5+2+3);
+                addbyte(0x8b); /*MOV EDX, w*/
+                addbyte(0x97);
+                addlong(offsetof(voodoo_state_t, w));
+                addbyte(0xb8); /*MOV new_depth, 0xf001*/
+                addlong(0xf001);
+                addbyte(0x89); /*MOV EBX, EDX*/
+                addbyte(0xd3);
+                addbyte(0xc1); /*SHR EDX, 16*/
+                addbyte(0xea);
+                addbyte(16);
+                addbyte(0x74); /*JZ got_depth*/
+                depth_jump_pos2 = block_pos;
+                addbyte(0);
+//                addbyte(5+5+3+2+2+2+/*3+*/3+2+6+4+5+2+3);
+                addbyte(0xb9); /*MOV ECX, 19*/
+                addlong(19);
+                addbyte(0x0f); /*BSR EAX, EDX*/
+                addbyte(0xbd);
+                addbyte(0xc2);
+                addbyte(0xba); /*MOV EDX, 15*/
+                addlong(15);
+                addbyte(0xf7); /*NOT EBX*/
+                addbyte(0xd3);
+                addbyte(0x29); /*SUB EDX, EAX - EDX = exp*/
+                addbyte(0xc2);
+                addbyte(0x29); /*SUB ECX, EDX*/
+                addbyte(0xd1);
+                addbyte(0xc1); /*SHL EDX, 12*/
+                addbyte(0xe2);
+                addbyte(12);
+                addbyte(0xd3); /*SHR EBX, CL*/
+                addbyte(0xeb);
+                addbyte(0x81); /*AND EBX, 0xfff - EBX = mant*/
+                addbyte(0xe3);
+                addlong(0xfff);
+                addbyte(0x8d); /*LEA EAX, 1[EDX, EBX]*/
+                addbyte(0x44);
+                addbyte(0x13);
+                addbyte(1);
+                addbyte(0xbb); /*MOV EBX, 0xffff*/
+                addlong(0xffff);
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x0f); /*CMOVA EAX, EBX*/
+                addbyte(0x47);
+                addbyte(0xc3);
+
+                if (depth_jump_pos)
+                        *(uint8_t *)&code_block[depth_jump_pos] = (block_pos - depth_jump_pos) - 1;
+                if (depth_jump_pos)
+                        *(uint8_t *)&code_block[depth_jump_pos2] = (block_pos - depth_jump_pos2) - 1;
+                
+                if ((params->fogMode & (FOG_ENABLE|FOG_CONSTANT|FOG_Z|FOG_ALPHA)) == FOG_ENABLE)
+                {
+                        addbyte(0x89); /*MOV state->w_depth[EDI], EAX*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, w_depth));
+                }
+        }
+        if (!(params->fbzMode & FBZ_W_BUFFER))
+        {
+                addbyte(0x8b); /*MOV EAX, z*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, z));
+                addbyte(0xbb); /*MOV EBX, 0xffff*/
+                addlong(0xffff);
+                addbyte(0x31); /*XOR ECX, ECX*/
+                addbyte(0xc9);
+                addbyte(0xc1); /*SAR EAX, 12*/
+                addbyte(0xf8);
+                addbyte(12);
+                addbyte(0x0f); /*CMOVS EAX, ECX*/
+                addbyte(0x48);
+                addbyte(0xc1);
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x0f); /*CMOVA EAX, EBX*/
+                addbyte(0x47);
+                addbyte(0xc3);
+        }
+        
+        if (params->fbzMode & FBZ_DEPTH_BIAS)
+        {
+                addbyte(0x0f); /*MOVSX EDX, params->zaColor[ESI]*/
+                addbyte(0xbf);
+                addbyte(0x96);
+                addlong(offsetof(voodoo_params_t, zaColor));                
+                if (params->fbzMode & FBZ_W_BUFFER)
+                {
+                        addbyte(0xbb); /*MOV EBX, 0xffff*/
+                        addlong(0xffff);
+                        addbyte(0x31); /*XOR ECX, ECX*/
+                        addbyte(0xc9);
+                }
+                addbyte(0x01); /*ADD EAX, EDX*/
+                addbyte(0xd0);
+                addbyte(0x0f); /*CMOVS EAX, ECX*/
+                addbyte(0x48);
+                addbyte(0xc1);
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                addbyte(0x0f); /*CMOVA EAX, EBX*/
+                addbyte(0x47);
+                addbyte(0xc3);
+        }
+
+        addbyte(0x89); /*MOV state->new_depth[EDI], EAX*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, new_depth));
+
+        if ((params->fbzMode & FBZ_DEPTH_ENABLE) && (depthop != DEPTHOP_ALWAYS) && (depthop != DEPTHOP_NEVER))
+        {
+                addbyte(0x8b); /*MOV EBX, state->x[EDI]*/
+                addbyte(0x9f);
+                if (params->aux_tiled)
+                        addlong(offsetof(voodoo_state_t, x_tiled));
+                else
+                        addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x8b);/*MOV ECX, aux_mem[EDI]*/
+                addbyte(0x8f);
+                addlong(offsetof(voodoo_state_t, aux_mem));
+                addbyte(0x0f); /*MOVZX EBX, [ECX+EBX*2]*/
+                addbyte(0xb7);
+                addbyte(0x1c);
+                addbyte(0x59);
+                if (params->fbzMode & FBZ_DEPTH_SOURCE)
+                {
+                        addbyte(0x0f); /*MOVZX EAX, zaColor[ESI]*/
+                        addbyte(0xb7);
+                        addbyte(0x86);
+                        addlong(offsetof(voodoo_params_t, zaColor));
+                }
+                addbyte(0x39); /*CMP EAX, EBX*/
+                addbyte(0xd8);
+                if (depthop == DEPTHOP_LESSTHAN)
+                {
+                        addbyte(0x0f); /*JAE skip*/
+                        addbyte(0x83);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_EQUAL)
+                {
+                        addbyte(0x0f); /*JNE skip*/
+                        addbyte(0x85);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_LESSTHANEQUAL)
+                {
+                        addbyte(0x0f); /*JA skip*/
+                        addbyte(0x87);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_GREATERTHAN)
+                {
+                        addbyte(0x0f); /*JBE skip*/
+                        addbyte(0x86);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_NOTEQUAL)
+                {
+                        addbyte(0x0f); /*JE skip*/
+                        addbyte(0x84);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else if (depthop == DEPTHOP_GREATERTHANEQUAL)
+                {
+                        addbyte(0x0f); /*JB skip*/
+                        addbyte(0x82);
+                        z_skip_pos = block_pos;
+                        addlong(0);
+                }
+                else
+                        fatal("Bad depth_op\n");
+        }
+        else if ((params->fbzMode & FBZ_DEPTH_ENABLE) && (depthop == DEPTHOP_NEVER))
+        {
+                addbyte(0xC3); /*RET*/
+//                addbyte(0x30); /*XOR EAX, EAX*/
+//                addbyte(0xc0);
+        }
+//        else
+//        {
+//                addbyte(0xb0); /*MOV AL, 1*/
+//                addbyte(1);
+//        }
+
+
+//        voodoo_combine = &code_block[block_pos];
+        /*XMM0 = colour*/
+        /*XMM2 = 0 (for unpacking)*/
+        
+        /*EDI = state, ESI = params*/
+
+        if ((params->textureMode[0] & TEXTUREMODE_LOCAL_MASK) == TEXTUREMODE_LOCAL || !voodoo->dual_tmus)
+        {
+                /*TMU0 only sampling local colour or only one TMU, only sample TMU0*/
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 0);
+                
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+                addbyte(0xc1); /*SHR EAX, 24*/
+                addbyte(0xe8);
+                addbyte(24);
+                addbyte(0x89); /*MOV state->tex_a[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_a));
+        }
+        else if ((params->textureMode[0] & TEXTUREMODE_MASK) == TEXTUREMODE_PASSTHROUGH)
+        {
+                /*TMU0 in pass-through mode, only sample TMU1*/
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 1);
+                
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+                addbyte(0xc1); /*SHR EAX, 24*/
+                addbyte(0xe8);
+                addbyte(24);
+                addbyte(0x89); /*MOV state->tex_a[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_a));
+        }
+        else
+        {
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 1);
+
+                addbyte(0x66); /*MOVD XMM3, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xd8);
+                if ((params->textureMode[1] & TEXTUREMODE_TRILINEAR) && tc_sub_clocal_1)
+                {
+                        addbyte(0x8b); /*MOV EAX, state->lod*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        if (!tc_reverse_blend_1)
+                        {
+                                addbyte(0xbb); /*MOV EBX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                        }
+                        addbyte(0x83); /*AND EAX, 1*/
+                        addbyte(0xe0);
+                        addbyte(1);
+                        if (!tca_reverse_blend_1)
+                        {
+                                addbyte(0xb9); /*MOV ECX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR ECX, ECX*/
+                                addbyte(0xc9);
+                        }
+                        addbyte(0x31); /*XOR EBX, EAX*/
+                        addbyte(0xc3);
+                        addbyte(0x31); /*XOR ECX, EAX*/
+                        addbyte(0xc1);
+                        addbyte(0xc1); /*SHL EBX, 4*/
+                        addbyte(0xe3);
+                        addbyte(4);
+                        /*EBX = tc_reverse_blend, ECX=tca_reverse_blend*/
+                }
+                addbyte(0x66); /*PUNPCKLBW XMM3, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xda);
+                if (tc_sub_clocal_1)
+                {
+                        switch (tc_mselect_1)
+                        {
+                                case TC_MSELECT_ZERO:
+                                addbyte(0x66); /*PXOR XMM0, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xc0);
+                                break;
+                                case TC_MSELECT_CLOCAL:
+                                addbyte(0xf3); /*MOVQ XMM0, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0x7e);
+                                addbyte(0xc3);
+                                break;
+                                case TC_MSELECT_AOTHER:
+                                addbyte(0x66); /*PXOR XMM0, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xc0);
+                                break;
+                                case TC_MSELECT_ALOCAL:
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM3, 0xff*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc3);
+                                addbyte(0xff);
+                                break;
+                                case TC_MSELECT_DETAIL:
+                                addbyte(0xb8); /*MOV EAX, params->detail_bias[1]*/
+                                addlong(params->detail_bias[1]);
+                                addbyte(0x2b); /*SUB EAX, state->lod*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod));
+                                addbyte(0xba); /*MOV EDX, params->detail_max[1]*/
+                                addlong(params->detail_max[1]);
+                                addbyte(0xc1); /*SHL EAX, params->detail_scale[1]*/
+                                addbyte(0xe0);
+                                addbyte(params->detail_scale[1]);
+                                addbyte(0x39); /*CMP EAX, EDX*/
+                                addbyte(0xd0);
+                                addbyte(0x0f); /*CMOVNL EAX, EDX*/
+                                addbyte(0x4d);
+                                addbyte(0xc2);
+                                addbyte(0x66); /*MOVD XMM0, EAX*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0xc0);
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM0, 0*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc0);
+                                addbyte(0);
+                                break;
+                                case TC_MSELECT_LOD_FRAC:
+                                addbyte(0x66); /*MOVD XMM0, state->lod_frac[1]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod_frac[1]));
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM0, 0*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc0);
+                                addbyte(0);
+                                break;
+                        }
+                        if (params->textureMode[1] & TEXTUREMODE_TRILINEAR)
+                        {
+                                addbyte(0x66); /*PXOR XMM0, xmm_00_ff_w[EBX]*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0x83);
+                                addlong((uint32_t)&xmm_00_ff_w[0]);
+                        }
+                        else if (!tc_reverse_blend_1)
+                        {
+                                addbyte(0x66); /*PXOR XMM0, xmm_ff_w*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0x05);
+                                addlong((uint32_t)&xmm_ff_w);
+                        }
+                        addbyte(0x66); /*PADDW XMM0, xmm_01_w*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x05);
+                        addlong((uint32_t)&xmm_01_w);
+                        addbyte(0xf3); /*MOVQ XMM1, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xca);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PMULLW XMM0, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xc3);
+                        addbyte(0x66); /*PMULHW XMM5, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xe5);
+                        addbyte(0xeb);
+                        addbyte(0x66); /*PUNPCKLWD XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0x61);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRAD XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x72);
+                        addbyte(0xe0);
+                        addbyte(8);
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PSUBW XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc8);
+                        if (tc_add_clocal_1)
+                        {
+                                addbyte(0x66); /*PADDW XMM1, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0xfd);
+                                addbyte(0xcb);
+                        }
+                        else if (tc_add_alocal_1)
+                        {
+                                addbyte(0xf2); /*PSHUFLW XMM0, XMM3, 0xff*/
+                                addbyte(0x0f);
+                                addbyte(0x70);
+                                addbyte(0xc3);
+                                addbyte(0xff);
+                                addbyte(0x66); /*PADDW XMM1, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xfd);
+                                addbyte(0xc8);
+                        }
+                        addbyte(0xf3); /*MOVQ XMM3, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xd9);
+                        addbyte(0x66); /*PACKUSWB XMM3, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xdb);
+                        if (tca_sub_clocal_1)
+                        {
+                                addbyte(0x66); /*MOVD EBX, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0x7e);
+                                addbyte(0xdb);
+                        }
+                        addbyte(0x66); /*PUNPCKLBW XMM3, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xda);
+                }
+
+                if (tca_sub_clocal_1)
+                {
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        switch (tca_mselect_1)
+                        {
+                                case TCA_MSELECT_ZERO:
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0);
+                                break;
+                                case TCA_MSELECT_CLOCAL:
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                break;
+                                case TCA_MSELECT_AOTHER:
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0);
+                                break;
+                                case TCA_MSELECT_ALOCAL:
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                break;
+                                case TCA_MSELECT_DETAIL:
+                                addbyte(0xb8); /*MOV EAX, params->detail_bias[1]*/
+                                addlong(params->detail_bias[1]);
+                                addbyte(0x2b); /*SUB EAX, state->lod*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod));
+                                addbyte(0xba); /*MOV EDX, params->detail_max[1]*/
+                                addlong(params->detail_max[1]);
+                                addbyte(0xc1); /*SHL EAX, params->detail_scale[1]*/
+                                addbyte(0xe0);
+                                addbyte(params->detail_scale[1]);
+                                addbyte(0x39); /*CMP EAX, EDX*/
+                                addbyte(0xd0);
+                                addbyte(0x0f); /*CMOVNL EAX, EDX*/
+                                addbyte(0x4d);
+                                addbyte(0xc2);
+                                break;
+                                case TCA_MSELECT_LOD_FRAC:
+                                addbyte(0x8b); /*MOV EAX, state->lod_frac[1]*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, lod_frac[1]));
+                                break;
+                        }
+                        if (params->textureMode[1] & TEXTUREMODE_TRILINEAR)
+                        {
+                                addbyte(0x33); /*XOR EAX, i_00_ff_w[ECX*4]*/
+                                addbyte(0x04);
+                                addbyte(0x8d);
+                                addlong((uint32_t)i_00_ff_w);
+                        }
+                        else if (!tc_reverse_blend_1)
+                        {
+                                addbyte(0x35); /*XOR EAX, 0xff*/
+                                addlong(0xff);
+                        }
+                        addbyte(0x83); /*ADD EAX, 1*/
+                        addbyte(0xc0);
+                        addbyte(1);
+                        addbyte(0x0f); /*IMUL EAX, EBX*/
+                        addbyte(0xaf);
+                        addbyte(0xc3);
+                        addbyte(0xb9); /*MOV ECX, 0xff*/
+                        addlong(0xff);
+                        addbyte(0xf7); /*NEG EAX*/
+                        addbyte(0xd8);
+                        addbyte(0xc1); /*SAR EAX, 8*/
+                        addbyte(0xf8);
+                        addbyte(8);
+                        if (tca_add_clocal_1 || tca_add_alocal_1)
+                        {
+                                addbyte(0x01); /*ADD EAX, EBX*/
+                                addbyte(0xd8);
+                        }
+                        addbyte(0x39); /*CMP ECX, EAX*/
+                        addbyte(0xc1);
+                        addbyte(0x0f); /*CMOVA ECX, EAX*/
+                        addbyte(0x47);
+                        addbyte(0xc8);
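+                        addbyte(0x66); /*PINSRW XMM3, EAX, 3 - NOTE(review): the clamped value from the CMOVA above is in ECX; confirm EAX is the intended source*/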
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0xd8);
+                        addbyte(3);
+                }
+        
+                block_pos = codegen_texture_fetch(code_block, voodoo, params, state, block_pos, 0);
+
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+                addbyte(0x66); /*MOVD XMM7, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xf8);
+        
+                if (params->textureMode[0] & TEXTUREMODE_TRILINEAR)
+                {
+                        addbyte(0x8b); /*MOV EAX, state->lod*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        if (!tc_reverse_blend)
+                        {
+                                addbyte(0xbb); /*MOV EBX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                        }
+                        addbyte(0x83); /*AND EAX, 1*/
+                        addbyte(0xe0);
+                        addbyte(1);
+                        if (!tca_reverse_blend)
+                        {
+                                addbyte(0xb9); /*MOV ECX, 1*/
+                                addlong(1);
+                        }
+                        else
+                        {
+                                addbyte(0x31); /*XOR ECX, ECX*/
+                                addbyte(0xc9);
+                        }
+                        addbyte(0x31); /*XOR EBX, EAX*/
+                        addbyte(0xc3);
+                        addbyte(0x31); /*XOR ECX, EAX*/
+                        addbyte(0xc1);
+                        addbyte(0xc1); /*SHL EBX, 4*/
+                        addbyte(0xe3);
+                        addbyte(4);
+                        /*EBX = tc_reverse_blend, ECX=tca_reverse_blend*/
+                }
+
+                /*XMM0 = TMU0 output, XMM3 = TMU1 output*/
+
+                addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xc2);
+                if (tc_zero_other)
+                {
+                        addbyte(0x66); /*PXOR XMM1, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc9);
+                }
+                else
+                {
+                        addbyte(0xf3); /*MOVQ XMM1, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xcb);
+                }
+                if (tc_sub_clocal)
+                {
+                        addbyte(0x66); /*PSUBW XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc8);
+                }
+
+                switch (tc_mselect)
+                {
+                        case TC_MSELECT_ZERO:
+                        addbyte(0x66); /*PXOR XMM4, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe4);
+                        break;
+                        case TC_MSELECT_CLOCAL:
+                        addbyte(0xf3); /*MOVQ XMM4, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe0);
+                        break;
+                        case TC_MSELECT_AOTHER:
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM3, 3, 3, 3, 3*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe3);
+                        addbyte(0xff);
+                        break;
+                        case TC_MSELECT_ALOCAL:
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM0, 3, 3, 3, 3*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe0);
+                        addbyte(0xff);
+                        break;
+                        case TC_MSELECT_DETAIL:
+                        addbyte(0xb8); /*MOV EAX, params->detail_bias[0]*/
+                        addlong(params->detail_bias[0]);
+                        addbyte(0x2b); /*SUB EAX, state->lod*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xba); /*MOV EDX, params->detail_max[0]*/
+                        addlong(params->detail_max[0]);
+                        addbyte(0xc1); /*SHL EAX, params->detail_scale[0]*/
+                        addbyte(0xe0);
+                        addbyte(params->detail_scale[0]);
+                        addbyte(0x39); /*CMP EAX, EDX*/
+                        addbyte(0xd0);
+                        addbyte(0x0f); /*CMOVNL EAX, EDX*/
+                        addbyte(0x4d);
+                        addbyte(0xc2);
+                        addbyte(0x66); /*MOVD XMM4, EAX*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0xe0);
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM4, 0*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe4);
+                        addbyte(0);
+                        break;
+                        case TC_MSELECT_LOD_FRAC:
+                        addbyte(0x66); /*MOVD XMM4, state->lod_frac[0]*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0xa7);
+                        addlong(offsetof(voodoo_state_t, lod_frac[0]));
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM4, 0*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe4);
+                        addbyte(0);
+                        break;
+                }
+                if (params->textureMode[0] & TEXTUREMODE_TRILINEAR)
+                {
+                        addbyte(0x66); /*PXOR XMM4, xmm_00_ff_w[EBX]*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xa3);
+                        addlong((uint32_t)&xmm_00_ff_w[0]);
+                }
+                else if (!tc_reverse_blend)
+                {
+                        addbyte(0x66); /*PXOR XMM4, FF*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0x25);
+                        addlong((uint32_t)&xmm_ff_w);
+                }
+                addbyte(0x66); /*PADDW XMM4, 1*/
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0x25);
+                addlong((uint32_t)&xmm_01_w);
+                addbyte(0xf3); /*MOVQ XMM5, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xe9);
+                addbyte(0x66); /*PMULLW XMM1, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xd5);
+                addbyte(0xcc);
+
+                if (tca_sub_clocal)
+                {
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                }
+
+                addbyte(0x66); /*PMULHW XMM5, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xe5);
+                addbyte(0xec);
+                addbyte(0x66); /*PUNPCKLWD XMM1, XMM5*/
+                addbyte(0x0f);
+                addbyte(0x61);
+                addbyte(0xcd);
+                addbyte(0x66); /*PSRAD XMM1, 8*/
+                addbyte(0x0f);
+                addbyte(0x72);
+                addbyte(0xe1);
+                addbyte(8);
+                addbyte(0x66); /*PACKSSDW XMM1, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x6b);
+                addbyte(0xc9);
+
+                if (tca_sub_clocal)
+                {
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                }
+                
+                if (tc_add_clocal)
+                {
+                        addbyte(0x66); /*PADDW XMM1, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc8);
+                }
+                else if (tc_add_alocal)
+                {
+                        addbyte(0xf2); /*PSHUFLW XMM4, XMM0, 3, 3, 3, 3*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xe0);
+                        addbyte(0xff);
+                        addbyte(0x66); /*PADDW XMM1, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xcc);
+                }
+        
+                addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xc0);
+                addbyte(0x66); /*PACKUSWB XMM3, XMM3*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xdb);
+                addbyte(0x66); /*PACKUSWB XMM1, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xc9);
+                if (tc_invert_output)
+                {
+                        addbyte(0x66); /*PXOR XMM1, FF*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0x0d);
+                        addlong((uint32_t)&xmm_ff_b);
+                }
+        
+                if (tca_zero_other)
+                {
+                        addbyte(0x31); /*XOR EAX, EAX*/
+                        addbyte(0xc0);
+                }
+                else
+                {
+                        addbyte(0x66); /*MOV EAX, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xd8);
+                        addbyte(0xc1); /*SHR EAX, 24*/
+                        addbyte(0xe8);
+                        addbyte(24);
+                }
+                if (tca_sub_clocal)
+                {
+                        addbyte(0x29); /*SUB EAX, EBX*/
+                        addbyte(0xd8);
+                }
+                switch (tca_mselect)
+                {
+                        case TCA_MSELECT_ZERO:
+                        addbyte(0x31); /*XOR EBX, EBX*/
+                        addbyte(0xdb);
+                        break;
+                        case TCA_MSELECT_CLOCAL:
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        break;
+                        case TCA_MSELECT_AOTHER:
+                        addbyte(0x66); /*MOV EBX, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xdb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        break;
+                        case TCA_MSELECT_ALOCAL:
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        break;
+                        case TCA_MSELECT_DETAIL:
+                        addbyte(0xbb); /*MOV EBX, params->detail_bias[1]*/
+                        addlong(params->detail_bias[1]);
+                        addbyte(0x2b); /*SUB EBX, state->lod*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, lod));
+                        addbyte(0xba); /*MOV EDX, params->detail_max[1]*/
+                        addlong(params->detail_max[1]);
+                        addbyte(0xc1); /*SHL EBX, params->detail_scale[1]*/
+                        addbyte(0xe3);
+                        addbyte(params->detail_scale[1]);
+                        addbyte(0x39); /*CMP EBX, EDX*/
+                        addbyte(0xd3);
+                        addbyte(0x0f); /*CMOVNL EBX, EDX*/
+                        addbyte(0x4d);
+                        addbyte(0xda);
+                        break;
+                        case TCA_MSELECT_LOD_FRAC:
+                        addbyte(0x8b); /*MOV EBX, state->lod_frac[0]*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, lod_frac[0]));
+                        break;
+                }
+                if (params->textureMode[0] & TEXTUREMODE_TRILINEAR)
+                {
+                        addbyte(0x33); /*XOR EBX, i_00_ff_w[ECX*4]*/
+                        addbyte(0x1c);
+                        addbyte(0x8d);
+                        addlong((uint32_t)i_00_ff_w);
+                }
+                else if (!tca_reverse_blend)
+                {
+                        addbyte(0x81); /*XOR EBX, 0xFF*/
+                        addbyte(0xf3);
+                        addlong(0xff);
+                }
+
+                addbyte(0x83); /*ADD EBX, 1*/
+                addbyte(0xc3);
+                addbyte(1);
+                addbyte(0x0f); /*IMUL EAX, EBX*/
+                addbyte(0xaf);
+                addbyte(0xc3);
+                addbyte(0x31); /*XOR EDX, EDX*/
+                addbyte(0xd2);
+                addbyte(0xc1); /*SAR EAX, 8*/
+                addbyte(0xf8);
+                addbyte(8);
+                if (tca_add_clocal || tca_add_alocal)
+                {
+                        addbyte(0x66); /*MOV EBX, XMM7*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xfb);
+                        addbyte(0xc1); /*SHR EBX, 24*/
+                        addbyte(0xeb);
+                        addbyte(24);
+                        addbyte(0x01); /*ADD EAX, EBX*/
+                        addbyte(0xd8);
+                }
+                addbyte(0x0f); /*CMOVS EAX, EDX*/
+                addbyte(0x48);
+                addbyte(0xc2);
+                addbyte(0xba); /*MOV EDX, 0xff*/
+                addlong(0xff);
+                addbyte(0x3d); /*CMP EAX, 0xff*/
+                addlong(0xff);
+                addbyte(0x0f); /*CMOVA EAX, EDX*/
+                addbyte(0x47);
+                addbyte(0xc2);
+                if (tca_invert_output)
+                {
+                        addbyte(0x35); /*XOR EAX, 0xff*/
+                        addlong(0xff);
+                }
+
+                addbyte(0x89); /*MOV state->tex_a[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, tex_a));
+
+                addbyte(0xf3); /*MOVQ XMM0, XMM1*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xc1);
+        }
+        if (cc_mselect == CC_MSELECT_TEXRGB)
+        {
+                addbyte(0xf3); /*MOVQ XMM4, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xe0);
+        }
+
+        if ((params->fbzMode & FBZ_CHROMAKEY))
+        {
+                switch (_rgb_sel)
+                {
+                        case CC_LOCALSELECT_ITER_RGB:
+                        addbyte(0xf3); /*MOVDQU XMM0, ib*/ /* ir, ig and ib must be in same dqword!*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, ib));
+                        addbyte(0x66); /*PSRAD XMM0, 12*/
+                        addbyte(0x0f);
+                        addbyte(0x72);
+                        addbyte(0xe0);
+                        addbyte(12);
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*MOVD EAX, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xc0);
+                        break;
+                        case CC_LOCALSELECT_COLOR1:
+                        addbyte(0x8b); /*MOV EAX, params->color1[ESI]*/
+                        addbyte(0x86);
+                        addlong(offsetof(voodoo_params_t, color1));
+                        break;
+                        case CC_LOCALSELECT_TEX:
+                        addbyte(0x66); /*MOVD EAX, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xc0);
+                        break;
+                }
+                addbyte(0x8b); /*MOV EBX, params->chromaKey[ESI]*/
+                addbyte(0x9e);
+                addlong(offsetof(voodoo_params_t, chromaKey));
+                addbyte(0x31); /*XOR EBX, EAX*/
+                addbyte(0xc3);
+                addbyte(0x81); /*AND EBX, 0xffffff*/
+                addbyte(0xe3);
+                addlong(0xffffff);
+                addbyte(0x0f); /*JE skip*/
+                addbyte(0x84);
+                chroma_skip_pos = block_pos;
+                addlong(0);
+        }
+
+        if (voodoo->trexInit1[0] & (1 << 18))
+        {
+                addbyte(0xb8); /*MOV EAX, tmuConfig*/
+                addlong(voodoo->tmuConfig);
+                addbyte(0x66); /*MOVD XMM0, EAX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xc0);
+        }
+
+        if ((params->alphaMode & ((1 << 0) | (1 << 4))) || (!(cc_mselect == 0 && cc_reverse_blend == 0) && (cc_mselect == CC_MSELECT_AOTHER || cc_mselect == CC_MSELECT_ALOCAL)))
+        {
+                /*EBX = a_other*/
+                switch (a_sel)
+                {
+                        case A_SEL_ITER_A:
+                        addbyte(0x8b); /*MOV EBX, state->ia*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, ia));
+                        addbyte(0x31); /*XOR EAX, EAX*/
+                        addbyte(0xc0);
+                        addbyte(0xba); /*MOV EDX, 0xff*/
+                        addlong(0xff);
+                        addbyte(0xc1); /*SAR EBX, 12*/
+                        addbyte(0xfb);
+                        addbyte(12);
+                        addbyte(0x0f); /*CMOVS EBX, EAX*/
+                        addbyte(0x48);
+                        addbyte(0xd8);
+                        addbyte(0x39); /*CMP EBX, EDX*/
+                        addbyte(0xd3);
+                        addbyte(0x0f); /*CMOVA EBX, EDX*/
+                        addbyte(0x47);
+                        addbyte(0xda);
+                        break;
+                        case A_SEL_TEX:
+                        addbyte(0x8b); /*MOV EBX, state->tex_a*/
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        break;
+                        case A_SEL_COLOR1:
+                        addbyte(0x0f); /*MOVZX EBX, params->color1+3*/
+                        addbyte(0xb6);
+                        addbyte(0x9e);
+                        addlong(offsetof(voodoo_params_t, color1)+3);
+                        break;
+                        default:
+                        addbyte(0x31); /*XOR EBX, EBX*/
+                        addbyte(0xdb);
+                        break;
+                }
+                /*ECX = a_local*/
+                switch (cca_localselect)
+                {
+                        case CCA_LOCALSELECT_ITER_A:
+                        if (a_sel == A_SEL_ITER_A)
+                        {
+                                addbyte(0x89); /*MOV ECX, EBX*/
+                                addbyte(0xd9);
+                        }
+                        else
+                        {
+                                addbyte(0x8b); /*MOV ECX, state->ia*/
+                                addbyte(0x8f);
+                                addlong(offsetof(voodoo_state_t, ia));
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0); 
+                                addbyte(0xba); /*MOV EDX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0xc1);/*SAR ECX, 12*/
+                                addbyte(0xf9);
+                                addbyte(12);
+                                addbyte(0x0f); /*CMOVS ECX, EAX*/
+                                addbyte(0x48);
+                                addbyte(0xc8);
+                                addbyte(0x39); /*CMP ECX, EDX*/
+                                addbyte(0xd1);
+                                addbyte(0x0f); /*CMOVA ECX, EDX*/
+                                addbyte(0x47);
+                                addbyte(0xca);
+                        }
+                        break;
+                        case CCA_LOCALSELECT_COLOR0:
+                        addbyte(0x0f); /*MOVZX ECX, params->color0+3*/
+                        addbyte(0xb6);
+                        addbyte(0x8e);
+                        addlong(offsetof(voodoo_params_t, color0)+3);
+                        break;
+                        case CCA_LOCALSELECT_ITER_Z:
+                        addbyte(0x8b); /*MOV ECX, state->z*/
+                        addbyte(0x8f);
+                        addlong(offsetof(voodoo_state_t, z));
+                        if (a_sel != A_SEL_ITER_A)
+                        {
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0); 
+                                addbyte(0xba); /*MOV EDX, 0xff*/
+                                addlong(0xff);
+                        }
+                        addbyte(0xc1);/*SAR ECX, 20*/
+                        addbyte(0xf9);
+                        addbyte(20);
+                        addbyte(0x0f); /*CMOVS ECX, EAX*/
+                        addbyte(0x48);
+                        addbyte(0xc8);
+                        addbyte(0x39); /*CMP ECX, EDX*/
+                        addbyte(0xd1);
+                        addbyte(0x0f); /*CMOVA ECX, EDX*/
+                        addbyte(0x47);
+                        addbyte(0xca);
+                        break;
+                                        
+                        default:
+                        addbyte(0xb9); /*MOV ECX, 0xff*/
+                        addlong(0xff);
+                        break;
+                }
+
+                if (cca_zero_other)
+                {
+                        addbyte(0x31); /*XOR EDX, EDX*/
+                        addbyte(0xd2);
+                }
+                else
+                {
+                        addbyte(0x89); /*MOV EDX, EBX*/
+                        addbyte(0xda);
+                }
+        
+                if (cca_sub_clocal)
+                {
+                        addbyte(0x29); /*SUB EDX, ECX*/
+                        addbyte(0xca);
+                }
+        }
+
+        if (cc_sub_clocal || cc_mselect == 1 || cc_add == 1)
+        {
+                /*XMM1 = local*/
+                if (!cc_localselect_override)
+                {
+                        if (cc_localselect)
+                        {
+                                addbyte(0x66); /*MOVD XMM1, params->color0*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, color0));
+                        }
+                        else
+                        {
+                                addbyte(0xf3); /*MOVDQU XMM1, ib*/ /* ir, ig and ib must be in same dqword!*/
+                                addbyte(0x0f);
+                                addbyte(0x6f);
+                                addbyte(0x8f);
+                                addlong(offsetof(voodoo_state_t, ib));
+                                addbyte(0x66); /*PSRAD XMM1, 12*/
+                                addbyte(0x0f);
+                                addbyte(0x72);
+                                addbyte(0xe1);
+                                addbyte(12);
+                                addbyte(0x66); /*PACKSSDW XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x6b);
+                                addbyte(0xc9);
+                                addbyte(0x66); /*PACKUSWB XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x67);
+                                addbyte(0xc9);
+                        }
+                }
+                else
+                {
+                        addbyte(0xf6); /*TEST state->tex_a, 0x80*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(0x80);
+                        addbyte(0x74);/*JZ !cc_localselect*/
+                        addbyte(8+2);
+                                addbyte(0x66); /*MOVD XMM1, params->color0*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x8e);
+                                addlong(offsetof(voodoo_params_t, color0));
+                                addbyte(0xeb); /*JMP +*/
+                                addbyte(8+5+4+4);
+                        /*!cc_localselect:*/
+                                addbyte(0xf3); /*MOVDQU XMM1, ib*/ /* ir, ig and ib must be in same dqword!*/
+                                addbyte(0x0f);
+                                addbyte(0x6f);
+                                addbyte(0x8f);
+                                addlong(offsetof(voodoo_state_t, ib));
+                                addbyte(0x66); /*PSRAD XMM1, 12*/
+                                addbyte(0x0f);
+                                addbyte(0x72);
+                                addbyte(0xe1);
+                                addbyte(12);
+                                addbyte(0x66); /*PACKSSDW XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x6b);
+                                addbyte(0xc9);
+                                addbyte(0x66); /*PACKUSWB XMM1, XMM1*/
+                                addbyte(0x0f);
+                                addbyte(0x67);
+                                addbyte(0xc9);
+                }
+                addbyte(0x66); /*PUNPCKLBW XMM1, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xca);
+        }
+        if (!cc_zero_other)
+        {
+                if (_rgb_sel == CC_LOCALSELECT_ITER_RGB)
+                {
+                        addbyte(0xf3); /*MOVDQU XMM0, ib*/ /* ir, ig and ib must be in same dqword!*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, ib));
+                        addbyte(0x66); /*PSRAD XMM0, 12*/
+                        addbyte(0x0f);
+                        addbyte(0x72);
+                        addbyte(0xe0);
+                        addbyte(12);
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                }
+                else if (_rgb_sel == CC_LOCALSELECT_TEX)
+                {
+#if 0
+                        addbyte(0xf3); /*MOVDQU XMM0, state->tex_b*/
+                        addbyte(0x0f);
+                        addbyte(0x6f);
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, tex_b));
+                        addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x6b);
+                        addbyte(0xc0);
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+#endif
+                }
+                else if (_rgb_sel == CC_LOCALSELECT_COLOR1)
+                {
+                        addbyte(0x66); /*MOVD XMM0, params->color1*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0x86);
+                        addlong(offsetof(voodoo_params_t, color1));
+                }
+                else
+                {
+                        /*MOVD XMM0, src_r*/
+                }
+                addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xc2);
+                if (cc_sub_clocal)
+                {
+                        addbyte(0x66); /*PSUBW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc1);
+                }
+        }
+        else
+        {
+                addbyte(0x66); /*PXOR XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0xef);
+                addbyte(0xc0);
+                if (cc_sub_clocal)
+                {
+                        addbyte(0x66); /*PSUBW XMM0, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xc1);
+                }
+        }
+
+        if (params->alphaMode & ((1 << 0) | (1 << 4)))
+        {
+                if (!(cca_mselect == 0 && cca_reverse_blend == 0))
+                {
+                        switch (cca_mselect)
+                        {
+                                case CCA_MSELECT_ALOCAL:
+                                addbyte(0x89); /*MOV EAX, ECX*/
+                                addbyte(0xc8);
+                                break;
+                                case CCA_MSELECT_AOTHER:
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                break;
+                                case CCA_MSELECT_ALOCAL2:
+                                addbyte(0x89); /*MOV EAX, ECX*/
+                                addbyte(0xc8);
+                                break;
+                                case CCA_MSELECT_TEX:
+                                addbyte(0x0f); /*MOVZX EAX, state->tex_a*/
+                                addbyte(0xb6);
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, tex_a));
+                                break;
+
+                                case CCA_MSELECT_ZERO:
+                                default:
+                                addbyte(0x31); /*XOR EAX, EAX*/
+                                addbyte(0xc0);
+                                break;
+                        }
+                        if (!cca_reverse_blend)
+                        {
+                                addbyte(0x35); /*XOR EAX, 0xff*/
+                                addlong(0xff);
+                        }
+                        addbyte(0x83); /*ADD EAX, 1*/
+                        addbyte(0xc0);
+                        addbyte(1);
+                        addbyte(0x0f); /*IMUL EDX, EAX*/
+                        addbyte(0xaf);
+                        addbyte(0xd0);
+                        addbyte(0xc1); /*SHR EDX, 8*/
+                        addbyte(0xea);
+                        addbyte(8);
+                }
+        }
+
+        if ((params->alphaMode & ((1 << 0) | (1 << 4))))
+        {
+                addbyte(0x31); /*XOR EAX, EAX*/
+                addbyte(0xc0);
+        }
+        
+        if (!(cc_mselect == 0 && cc_reverse_blend == 0) && cc_mselect == CC_MSELECT_AOTHER)
+        {
+                /*Copy a_other to XMM3 before it gets modified*/
+                addbyte(0x66); /*MOVD XMM3, EDX*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0xda);
+                addbyte(0xf2); /*PSHUFLW XMM3, XMM3, 0*/
+                addbyte(0x0f);
+                addbyte(0x70);
+                addbyte(0xdb);
+                addbyte(0x00);
+        }
+        
+        if (cca_add && (params->alphaMode & ((1 << 0) | (1 << 4))))
+        {
+                addbyte(0x01); /*ADD EDX, ECX*/
+                addbyte(0xca);
+        }
+        
+        if ((params->alphaMode & ((1 << 0) | (1 << 4))))
+        {
+                addbyte(0x85); /*TEST EDX, EDX*/
+                addbyte(0xd2);
+                addbyte(0x0f); /*CMOVS EDX, EAX*/
+                addbyte(0x48);
+                addbyte(0xd0);
+                addbyte(0xb8); /*MOV EAX, 0xff*/
+                addlong(0xff);
+                addbyte(0x81); /*CMP EDX, 0xff*/
+                addbyte(0xfa);
+                addlong(0xff);
+                addbyte(0x0f); /*CMOVA EDX, EAX*/
+                addbyte(0x47);
+                addbyte(0xd0);
+
+                if (cca_invert_output)
+                {
+                        addbyte(0x81); /*XOR EDX, 0xff*/
+                        addbyte(0xf2);
+                        addlong(0xff);
+                }
+        }
+
+        if (!(cc_mselect == 0 && cc_reverse_blend == 0))
+        {
+                switch (cc_mselect)
+                {
+                        case CC_MSELECT_ZERO:
+                        addbyte(0x66); /*PXOR XMM3, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xdb);
+                        break;
+                        case CC_MSELECT_CLOCAL:
+                        addbyte(0xf3); /*MOVQ XMM3, XMM1*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xd9);
+                        break;
+                        case CC_MSELECT_ALOCAL:
+                        addbyte(0x66); /*MOVD XMM3, ECX*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0xd9);
+                        addbyte(0xf2); /*PSHUFLW XMM3, XMM3, 0*/
+                        addbyte(0x0f);
+                        addbyte(0x70);
+                        addbyte(0xdb);
+                        addbyte(0x00);
+                        break;
+                        case CC_MSELECT_AOTHER:
+                        /*Handled above*/
+                        break;
+                        case CC_MSELECT_TEX:
+                        addbyte(0x66); /*PINSRW XMM3, state->tex_a, 0*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(0);
+                        addbyte(0x66); /*PINSRW XMM3, state->tex_a, 1*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(1);
+                        addbyte(0x66); /*PINSRW XMM3, state->tex_a, 2*/
+                        addbyte(0x0f);
+                        addbyte(0xc4);
+                        addbyte(0x9f);
+                        addlong(offsetof(voodoo_state_t, tex_a));
+                        addbyte(2);
+                        break;
+                        case CC_MSELECT_TEXRGB:
+                        addbyte(0x66); /*PUNPCKLBW XMM4, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xe2);
+                        addbyte(0xf3); /*MOVQ XMM3, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xdc);
+                        break;
+                        default:
+                        addbyte(0x66); /*PXOR XMM3, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xdb);
+                        break;
+                }
+                addbyte(0xf3); /*MOV XMM4, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xe0);
+                if (!cc_reverse_blend)
+                {
+                        addbyte(0x66); /*PXOR XMM3, 0xff*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0x1d);
+                        addlong((uint32_t)&xmm_ff_w);
+                }
+                addbyte(0x66); /*PADDW XMM3, 1*/
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0x1d);
+                addlong((uint32_t)&xmm_01_w);
+                addbyte(0x66); /*PMULLW XMM0, XMM3*/
+                addbyte(0x0f);
+                addbyte(0xd5);
+                addbyte(0xc3);
+                addbyte(0x66); /*PMULHW XMM4, XMM3*/
+                addbyte(0x0f);
+                addbyte(0xe5);
+                addbyte(0xe3);
+                addbyte(0x66); /*PUNPCKLWD XMM0, XMM4*/
+                addbyte(0x0f);
+                addbyte(0x61);
+                addbyte(0xc4);
+                addbyte(0x66); /*PSRLD XMM0, 8*/
+                addbyte(0x0f);
+                addbyte(0x72);
+                addbyte(0xe0);
+                addbyte(8);
+                addbyte(0x66); /*PACKSSDW XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x6b);
+                addbyte(0xc0);
+        }
+        
+        if (cc_add == 1)
+        {
+                addbyte(0x66); /*PADDW XMM0, XMM1*/
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0xc1);
+        }
+
+        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+        addbyte(0x0f);
+        addbyte(0x67);
+        addbyte(0xc0);
+
+        if (cc_invert_output)
+        {
+                addbyte(0x66); /*PXOR XMM0, 0xff*/
+                addbyte(0x0f);
+                addbyte(0xef);
+                addbyte(0x05);
+                addlong((uint32_t)&xmm_ff_b);
+        }
+//#if 0
+//        addbyte(0x66); /*MOVD state->out[EDI], XMM0*/
+//        addbyte(0x0f);
+//        addbyte(0x7e);
+//        addbyte(0x87);
+//        addlong(offsetof(voodoo_state_t, out));
+        if (params->fogMode & FOG_ENABLE)
+        {
+                if (params->fogMode & FOG_CONSTANT)                     
+                {                                                       
+                        addbyte(0x66); /*MOVD XMM3, params->fogColor[ESI]*/
+                        addbyte(0x0f);
+                        addbyte(0x6e);
+                        addbyte(0x9e);
+                        addlong(offsetof(voodoo_params_t, fogColor));
+                        addbyte(0x66); /*PADDUSB XMM0, XMM3*/
+                        addbyte(0x0f);
+                        addbyte(0xdc);
+                        addbyte(0xc3);
+/*                        src_r += params->fogColor.r;                    
+                        src_g += params->fogColor.g;                    
+                        src_b += params->fogColor.b;                    */
+                }                                                       
+                else                                                    
+                {                                                       
+                        /*int fog_r, fog_g, fog_b, fog_a;                 */
+                                                                        
+                        addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                        addbyte(0x0f);
+                        addbyte(0x60);
+                        addbyte(0xc2);
+
+                        if (!(params->fogMode & FOG_ADD))               
+                        {
+                                addbyte(0x66); /*MOVD XMM3, params->fogColor[ESI]*/
+                                addbyte(0x0f);
+                                addbyte(0x6e);
+                                addbyte(0x9e);
+                                addlong(offsetof(voodoo_params_t, fogColor));
+                                addbyte(0x66); /*PUNPCKLBW XMM3, XMM2*/
+                                addbyte(0x0f);
+                                addbyte(0x60);
+                                addbyte(0xda);
+                        }                                               
+                        else
+                        {
+                                addbyte(0x66); /*PXOR XMM3, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0xef);
+                                addbyte(0xdb);
+                        }
+                                                                        
+                        if (!(params->fogMode & FOG_MULT))
+                        {
+                                addbyte(0x66); /*PSUBW XMM3, XMM0*/
+                                addbyte(0x0f);
+                                addbyte(0xf9);
+                                addbyte(0xd8);
+                        }
+
+                        /*Divide by 2 to prevent overflow on multiply*/
+                        addbyte(0x66); /*PSRAW XMM3, 1*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xe3);
+                        addbyte(1);
+
+                        switch (params->fogMode & (FOG_Z|FOG_ALPHA))
+                        {
+                                case 0:
+                                addbyte(0x8b); /*MOV EBX, state->w_depth[EDI]*/
+                                addbyte(0x9f);
+                                addlong(offsetof(voodoo_state_t, w_depth));
+                                addbyte(0x89); /*MOV EAX, EBX*/
+                                addbyte(0xd8);
+                                addbyte(0xc1); /*SHR EBX, 10*/
+                                addbyte(0xeb);
+                                addbyte(10);
+                                addbyte(0xc1); /*SHR EAX, 2*/
+                                addbyte(0xe8);
+                                addbyte(2);
+                                addbyte(0x83); /*AND EBX, 0x3f*/
+                                addbyte(0xe3);
+                                addbyte(0x3f);
+                                addbyte(0x25); /*AND EAX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0xf6); /*MUL params->fogTable+1[ESI+EBX*2]*/
+                                addbyte(0xa4);
+                                addbyte(0x5e);
+                                addlong(offsetof(voodoo_params_t, fogTable)+1);
+                                addbyte(0x0f); /*MOVZX EBX, params->fogTable[ESI+EBX*2]*/
+                                addbyte(0xb6);
+                                addbyte(0x9c);
+                                addbyte(0x5e);
+                                addlong(offsetof(voodoo_params_t, fogTable));
+                                addbyte(0xc1); /*SHR EAX, 10*/
+                                addbyte(0xe8);
+                                addbyte(10);
+                                addbyte(0x01); /*ADD EAX, EBX*/
+                                addbyte(0xd8);
+
+/*                                int fog_idx = (w_depth >> 10) & 0x3f;
+
+                                fog_a = params->fogTable[fog_idx].fog;
+                                fog_a += (params->fogTable[fog_idx].dfog * ((w_depth >> 2) & 0xff)) >> 10;*/
+                                break;
+                                
+                                case FOG_Z:
+                                addbyte(0x8b); /*MOV EAX, state->z[EDI]*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, z));
+                                addbyte(0xc1); /*SHR EAX, 12*/
+                                addbyte(0xe8);
+                                addbyte(12);
+                                addbyte(0x25); /*AND EAX, 0xff*/
+                                addlong(0xff);
+//                                fog_a = (z >> 20) & 0xff;
+                                break;
+                                
+                                case FOG_ALPHA:
+                                addbyte(0x8b); /*MOV EAX, state->ia[EDI]*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, ia));
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                                addbyte(0xc1); /*SAR EAX, 12*/
+                                addbyte(0xf8);
+                                addbyte(12);
+                                addbyte(0x0f); /*CMOVS EAX, EBX*/
+                                addbyte(0x48);
+                                addbyte(0xc3);
+                                addbyte(0xbb); /*MOV EBX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x3d); /*CMP EAX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x0f); /*CMOVAE EAX, EBX*/
+                                addbyte(0x43);
+                                addbyte(0xc3);
+//                                fog_a = CLAMP(ia >> 12);
+                                break;
+                                
+                                case FOG_W:
+                                addbyte(0x8b); /*MOV EAX, state->w[EDI]+4*/
+                                addbyte(0x87);
+                                addlong(offsetof(voodoo_state_t, w)+4);
+                                addbyte(0x31); /*XOR EBX, EBX*/
+                                addbyte(0xdb);
+                                addbyte(0x09); /*OR EAX, EAX*/
+                                addbyte(0xc0);
+                                addbyte(0x0f); /*CMOVS EAX, EBX*/
+                                addbyte(0x48);
+                                addbyte(0xc3);
+                                addbyte(0xbb); /*MOV EBX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x3d); /*CMP EAX, 0xff*/
+                                addlong(0xff);
+                                addbyte(0x0f); /*CMOVAE EAX, EBX*/
+                                addbyte(0x43);
+                                addbyte(0xc3);
+//                                fog_a = CLAMP(w >> 32);
+                                break;
+                        }
+                        addbyte(0x01); /*ADD EAX, EAX*/
+                        addbyte(0xc0);
+//                        fog_a++;
+
+                        addbyte(0x66); /*PMULLW XMM3, alookup+4[EAX*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x1c);
+                        addbyte(0xc5);
+                        addlong(((uintptr_t)alookup) + 16);
+                        addbyte(0x66); /*PSRAW XMM3, 7*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xe3);
+                        addbyte(7);
+/*                        fog_r = (fog_r * fog_a) >> 8;
+                        fog_g = (fog_g * fog_a) >> 8;
+                        fog_b = (fog_b * fog_a) >> 8;*/
+
+                        if (params->fogMode & FOG_MULT)
+                        {
+                                addbyte(0xf3); /*MOV XMM0, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0x7e);
+                                addbyte(0xc3);
+                        }
+                        else
+                        {
+                                addbyte(0x66); /*PADDW XMM0, XMM3*/
+                                addbyte(0x0f);
+                                addbyte(0xfd);
+                                addbyte(0xc3);
+/*                                src_r += fog_r;
+                                src_g += fog_g;
+                                src_b += fog_b;*/
+                        }
+                        addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x67);
+                        addbyte(0xc0);
+                }
+
+/*                src_r = CLAMP(src_r);
+                src_g = CLAMP(src_g);
+                src_b = CLAMP(src_b);*/
+        }
+
+        if ((params->alphaMode & 1) && (alpha_func != AFUNC_NEVER) && (alpha_func != AFUNC_ALWAYS))
+        {
+                addbyte(0x0f); /*MOVZX ECX, params->alphaMode+3*/
+                addbyte(0xb6);
+                addbyte(0x8e);
+                addlong(offsetof(voodoo_params_t, alphaMode) + 3);
+                addbyte(0x39); /*CMP EDX, ECX*/
+                addbyte(0xca);
+
+                switch (alpha_func)
+                {
+                        case AFUNC_LESSTHAN:
+                        addbyte(0x0f); /*JAE skip*/
+                        addbyte(0x83);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_EQUAL:
+                        addbyte(0x0f); /*JNE skip*/
+                        addbyte(0x85);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_LESSTHANEQUAL:
+                        addbyte(0x0f); /*JA skip*/
+                        addbyte(0x87);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_GREATERTHAN:
+                        addbyte(0x0f); /*JBE skip*/
+                        addbyte(0x86);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_NOTEQUAL:
+                        addbyte(0x0f); /*JE skip*/
+                        addbyte(0x84);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                        case AFUNC_GREATERTHANEQUAL:
+                        addbyte(0x0f); /*JB skip*/
+                        addbyte(0x82);
+                        a_skip_pos = block_pos;
+                        addlong(0);
+                        break;
+                }
+        }
+        else if ((params->alphaMode & 1) && (alpha_func == AFUNC_NEVER))
+        {
+                addbyte(0xC3); /*RET*/
+        }
+        
+        if (params->alphaMode & (1 << 4))
+        {
+                addbyte(0x8b); /*MOV EAX, state->x[EDI]*/
+                addbyte(0x87);
+                if (params->col_tiled)
+                        addlong(offsetof(voodoo_state_t, x_tiled));
+                else
+                        addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x8b); /*MOV EBP, fb_mem*/
+                addbyte(0xaf);
+                addlong(offsetof(voodoo_state_t, fb_mem));
+                addbyte(0x01); /*ADD EDX, EDX*/
+                addbyte(0xd2);
+                addbyte(0x0f); /*MOVZX EAX, [EBP+EAX*2]*/
+                addbyte(0xb7);
+                addbyte(0x44);
+                addbyte(0x45);
+                addbyte(0);
+                addbyte(0x66); /*PUNPCKLBW XMM0, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xc2);
+                addbyte(0x66); /*MOVD XMM4, rgb565[EAX*4]*/
+                addbyte(0x0f);
+                addbyte(0x6e);
+                addbyte(0x24);
+                addbyte(0x85);
+                addlong((uint32_t)rgb565);
+                addbyte(0x66); /*PUNPCKLBW XMM4, XMM2*/
+                addbyte(0x0f);
+                addbyte(0x60);
+                addbyte(0xe2);
+                addbyte(0xf3); /*MOV XMM6, XMM4*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xf4);
+                
+                switch (dest_afunc)
+                {
+                        case AFUNC_AZERO:
+                        addbyte(0x66); /*PXOR XMM4, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe4);
+                        break;
+                        case AFUNC_ASRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM4, alookup[EDX*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x24);
+                        addbyte(0xd5);
+                        addlong((uint32_t)alookup);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x25);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_A_COLOR:
+                        addbyte(0x66); /*PMULLW XMM4, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xe0);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x25);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_ADST_ALPHA:
+                        break;
+                        case AFUNC_AONE:
+                        break;
+                        case AFUNC_AOMSRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM4, aminuslookup[EDX*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x24);
+                        addbyte(0xd5);
+                        addlong((uint32_t)aminuslookup);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x25);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOM_COLOR:
+                        addbyte(0xf3); /*MOVQ XMM5, xmm_ff_w*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0x2d);
+                        addlong((uint32_t)&xmm_ff_w);
+                        addbyte(0x66); /*PSUBW XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PMULLW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xe5);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x25);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOMDST_ALPHA:
+                        addbyte(0x66); /*PXOR XMM4, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xe4);
+                        break;
+                        case AFUNC_ASATURATE:
+                        addbyte(0x66); /*PMULLW XMM4, minus_254*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x25);
+                        addlong((uint32_t)&minus_254);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM4*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xec);
+                        addbyte(0x66); /*PADDW XMM4, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x25);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM4, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xe5);
+                        addbyte(0x66); /*PSRLW XMM4, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd4);
+                        addbyte(8);
+                }
+
+                switch (src_afunc)
+                {
+                        case AFUNC_AZERO:
+                        addbyte(0x66); /*PXOR XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc0);
+                        break;
+                        case AFUNC_ASRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM0, alookup[EDX*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x04);
+                        addbyte(0xd5);
+                        addlong((uint32_t)alookup);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x05);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_A_COLOR:
+                        addbyte(0x66); /*PMULLW XMM0, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xc6);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x05);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_ADST_ALPHA:
+                        break;
+                        case AFUNC_AONE:
+                        break;
+                        case AFUNC_AOMSRC_ALPHA:
+                        addbyte(0x66); /*PMULLW XMM0, aminuslookup[EDX*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0x04);
+                        addbyte(0xd5);
+                        addlong((uint32_t)aminuslookup);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x05);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOM_COLOR:
+                        addbyte(0xf3); /*MOVQ XMM5, xmm_ff_w*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0x2d);
+                        addlong((uint32_t)&xmm_ff_w);
+                        addbyte(0x66); /*PSUBW XMM5, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xf9);
+                        addbyte(0xee);
+                        addbyte(0x66); /*PMULLW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xd5);
+                        addbyte(0xc5);
+                        addbyte(0xf3); /*MOVQ XMM5, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0x7e);
+                        addbyte(0xe8);
+                        addbyte(0x66); /*PADDW XMM0, alookup[1*8]*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0x05);
+                        addlong((uint32_t)alookup + 16);
+                        addbyte(0x66); /*PSRLW XMM5, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd5);
+                        addbyte(8);
+                        addbyte(0x66); /*PADDW XMM0, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfd);
+                        addbyte(0xc5);
+                        addbyte(0x66); /*PSRLW XMM0, 8*/
+                        addbyte(0x0f);
+                        addbyte(0x71);
+                        addbyte(0xd0);
+                        addbyte(8);
+                        break;
+                        case AFUNC_AOMDST_ALPHA:
+                        addbyte(0x66); /*PXOR XMM0, XMM0*/
+                        addbyte(0x0f);
+                        addbyte(0xef);
+                        addbyte(0xc0);
+                        break;
+                        case AFUNC_ACOLORBEFOREFOG:
+                        break;
+                }
+                
+                addbyte(0x66); /*PADDW XMM0, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xfd);
+                addbyte(0xc4);
+
+                addbyte(0x66); /*PACKUSWB XMM0, XMM0*/
+                addbyte(0x0f);
+                addbyte(0x67);
+                addbyte(0xc0);
+        }
+//#endif        
+
+//        addbyte(0x8b); /*MOV EDX, x (ESP+12)*/
+//        addbyte(0x54);
+//        addbyte(0x24);
+//        addbyte(12);
+
+
+        addbyte(0x8b); /*MOV EDX, state->x[EDI]*/
+        addbyte(0x97);
+        if (params->col_tiled)
+                addlong(offsetof(voodoo_state_t, x_tiled));
+        else
+                addlong(offsetof(voodoo_state_t, x));
+
+        addbyte(0x66); /*MOV EAX, XMM0*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xc0);
+        
+        if (params->fbzMode & FBZ_RGB_WMASK)
+        {
+//                addbyte(0x89); /*MOV state->rgb_out[EDI], EAX*/
+//                addbyte(0x87);
+//                addlong(offsetof(voodoo_state_t, rgb_out));
+                
+                if (dither)
+                {
+                        addbyte(0x8b); /*MOV ESI, real_y (ESP+16)*/
+                        addbyte(0x74);
+                        addbyte(0x24);
+                        addbyte(16+16);
+                        addbyte(0x0f); /*MOVZX EBX, AH*/ /*G*/
+                        addbyte(0xb6);
+                        addbyte(0xdc);
+                        if (dither2x2)
+                        {
+                                addbyte(0x83); /*AND EDX, 1*/
+                                addbyte(0xe2);
+                                addbyte(1);
+                                addbyte(0x83); /*AND ESI, 1*/
+                                addbyte(0xe6);
+                                addbyte(1);
+                                addbyte(0xc1); /*SHL EBX, 2*/
+                                addbyte(0xe3);
+                                addbyte(2);
+                        }
+                        else
+                        {
+                                addbyte(0x83); /*AND EDX, 3*/
+                                addbyte(0xe2);
+                                addbyte(3);
+                                addbyte(0x83); /*AND ESI, 3*/
+                                addbyte(0xe6);
+                                addbyte(3);
+                                addbyte(0xc1); /*SHL EBX, 4*/
+                                addbyte(0xe3);
+                                addbyte(4);
+                        }
+                        addbyte(0x0f); /*MOVZX ECX, AL*/ /*R*/
+                        addbyte(0xb6);
+                        addbyte(0xc8);
+                        if (dither2x2)
+                        {
+                                addbyte(0xc1); /*SHR EAX, 14*/
+                                addbyte(0xe8);
+                                addbyte(14);
+                                addbyte(0x8d); /*LEA ESI, EDX+ESI*2*/
+                                addbyte(0x34);
+                                addbyte(0x72);
+                        }
+                        else
+                        {
+                                addbyte(0xc1); /*SHR EAX, 12*/
+                                addbyte(0xe8);
+                                addbyte(12);
+                                addbyte(0x8d); /*LEA ESI, EDX+ESI*4*/
+                                addbyte(0x34);
+                                addbyte(0xb2);
+                        }
+                        addbyte(0x8b); /*MOV EDX, state->x[EDI]*/
+                        addbyte(0x97);
+                        if (params->col_tiled)
+                                addlong(offsetof(voodoo_state_t, x_tiled));
+                        else
+                                addlong(offsetof(voodoo_state_t, x));
+                        if (dither2x2)
+                        {
+                                addbyte(0xc1); /*SHL ECX, 2*/
+                                addbyte(0xe1);
+                                addbyte(2);
+                                addbyte(0x25); /*AND EAX, 0x3fc*/ /*B*/
+                                addlong(0x3fc);
+                        }
+                        else
+                        {
+                                addbyte(0xc1); /*SHL ECX, 4*/
+                                addbyte(0xe1);
+                                addbyte(4);
+                                addbyte(0x25); /*AND EAX, 0xff0*/ /*B*/
+                                addlong(0xff0);
+                        }
+                        addbyte(0x0f); /*MOVZX EBX, dither_g[EBX+ESI]*/
+                        addbyte(0xb6);
+                        addbyte(0x9c);
+                        addbyte(0x33);
+                        addlong(dither2x2 ? (uint32_t)dither_g2x2 : (uint32_t)dither_g);
+                        addbyte(0x0f); /*MOVZX ECX, dither_rb[ECX+ESI]*/
+                        addbyte(0xb6);
+                        addbyte(0x8c);
+                        addbyte(0x31);
+                        addlong(dither2x2 ? (uint32_t)dither_rb2x2 : (uint32_t)dither_rb);
+                        addbyte(0x0f); /*MOVZX EAX, dither_rb[EAX+ESI]*/
+                        addbyte(0xb6);
+                        addbyte(0x84);
+                        addbyte(0x30);
+                        addlong(dither2x2 ? (uint32_t)dither_rb2x2 : (uint32_t)dither_rb);
+                        addbyte(0xc1); /*SHL EBX, 5*/
+                        addbyte(0xe3);
+                        addbyte(5);
+                        addbyte(0xc1); /*SHL EAX, 11*/
+                        addbyte(0xe0);
+                        addbyte(11);
+                        addbyte(0x09); /*OR EAX, EBX*/
+                        addbyte(0xd8);
+                        addbyte(0x09); /*OR EAX, ECX*/
+                        addbyte(0xc8);
+                }
+                else
+                {
+                        addbyte(0x89); /*MOV EBX, EAX*/
+                        addbyte(0xc3);
+                        addbyte(0x0f); /*MOVZX ECX, AH*/
+                        addbyte(0xb6);
+                        addbyte(0xcc);
+                        addbyte(0xc1); /*SHR EAX, 3*/
+                        addbyte(0xe8);
+                        addbyte(3);
+                        addbyte(0xc1); /*SHR EBX, 8*/
+                        addbyte(0xeb);
+                        addbyte(8);
+                        addbyte(0xc1); /*SHL ECX, 3*/
+                        addbyte(0xe1);
+                        addbyte(3);
+                        addbyte(0x81); /*AND EAX, 0x001f*/
+                        addbyte(0xe0);
+                        addlong(0x001f);
+                        addbyte(0x81); /*AND EBX, 0xf800*/
+                        addbyte(0xe3);
+                        addlong(0xf800);
+                        addbyte(0x81); /*AND ECX, 0x07e0*/
+                        addbyte(0xe1);
+                        addlong(0x07e0);
+                        addbyte(0x09); /*OR EAX, EBX*/
+                        addbyte(0xd8);
+                        addbyte(0x09); /*OR EAX, ECX*/
+                        addbyte(0xc8);
+                }
+                addbyte(0x8b); /*MOV ESI, fb_mem*/
+                addbyte(0xb7);
+                addlong(offsetof(voodoo_state_t, fb_mem));
+                addbyte(0x66); /*MOV [ESI+EDX*2], AX*/
+                addbyte(0x89);
+                addbyte(0x04);
+                addbyte(0x56);
+        }
+
+        if ((params->fbzMode & (FBZ_DEPTH_WMASK | FBZ_DEPTH_ENABLE)) == (FBZ_DEPTH_WMASK | FBZ_DEPTH_ENABLE))
+        {
+                addbyte(0x8b); /*MOV EDX, state->x[EDI]*/
+                addbyte(0x97);
+                if (params->aux_tiled)
+                        addlong(offsetof(voodoo_state_t, x_tiled));
+                else
+                        addlong(offsetof(voodoo_state_t, x));
+                addbyte(0x66); /*MOV AX, new_depth*/
+                addbyte(0x8b);
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, new_depth));
+                addbyte(0x8b); /*MOV ESI, aux_mem*/
+                addbyte(0xb7);
+                addlong(offsetof(voodoo_state_t, aux_mem));
+                addbyte(0x66); /*MOV [ESI+EDX*2], AX*/
+                addbyte(0x89);
+                addbyte(0x04);
+                addbyte(0x56);
+        }
+
+        if (z_skip_pos)
+                *(uint32_t *)&code_block[z_skip_pos] = (block_pos - z_skip_pos) - 4;
+        if (a_skip_pos)
+                *(uint32_t *)&code_block[a_skip_pos] = (block_pos - a_skip_pos) - 4;
+        if (chroma_skip_pos)
+                *(uint32_t *)&code_block[chroma_skip_pos] = (block_pos - chroma_skip_pos) - 4;
+
+
+        addbyte(0x8b); /*MOV ESI, [ESP+8]*/
+        addbyte(0x74);
+        addbyte(0x24);
+        addbyte(8+16);
+
+        if (voodoo->dual_tmus)
+        {
+                addbyte(0xf3); /*MOVDQU XMM3, state->tmu1_s[EDI]*/
+                addbyte(0x0f);
+                addbyte(0x6f);
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, tmu1_s));
+                addbyte(0xf3); /*MOVQ XMM4, state->tmu1_w[EDI]*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xa7);
+                addlong(offsetof(voodoo_state_t, tmu1_w));
+                addbyte(0xf3); /*MOVDQU XMM5, params->tmu[1].dSdX[ESI]*/
+                addbyte(0x0f);
+                addbyte(0x6f);
+                addbyte(0xae);
+                addlong(offsetof(voodoo_params_t, tmu[1].dSdX));
+                addbyte(0xf3); /*MOVQ XMM6, params->tmu[1].dWdX[ESI]*/
+                addbyte(0x0f);
+                addbyte(0x7e);
+                addbyte(0xb6);
+                addlong(offsetof(voodoo_params_t, tmu[1].dWdX));
+                if (state->xdir > 0)
+                {
+                        addbyte(0x66); /*PADDQ XMM3, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xd4);
+                        addbyte(0xdd);
+                        addbyte(0x66); /*PADDQ XMM4, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xd4);
+                        addbyte(0xe6);
+                }
+                else
+                {
+                        addbyte(0x66); /*PSUBQ XMM3, XMM5*/
+                        addbyte(0x0f);
+                        addbyte(0xfb);
+                        addbyte(0xdd);
+                        addbyte(0x66); /*PSUBQ XMM4, XMM6*/
+                        addbyte(0x0f);
+                        addbyte(0xfb);
+                        addbyte(0xe6);
+                }
+                addbyte(0xf3); /*MOVDQU state->tmu1_s, XMM3*/
+                addbyte(0x0f);
+                addbyte(0x7f);
+                addbyte(0x9f);
+                addlong(offsetof(voodoo_state_t, tmu1_s));
+                addbyte(0x66); /*MOVQ state->tmu1_w, XMM4*/
+                addbyte(0x0f);
+                addbyte(0xd6);
+                addbyte(0xa7);
+                addlong(offsetof(voodoo_state_t, tmu1_w));
+        }
+
+        addbyte(0xf3); /*MOVDQU XMM1, state->ib[EDI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x8f);
+        addlong(offsetof(voodoo_state_t, ib));
+        addbyte(0xf3); /*MOVDQU XMM3, state->tmu0_s[EDI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x9f);
+        addlong(offsetof(voodoo_state_t, tmu0_s));
+        addbyte(0xf3); /*MOVQ XMM4, state->tmu0_w[EDI]*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xa7);
+        addlong(offsetof(voodoo_state_t, tmu0_w));
+        addbyte(0xf3); /*MOVDQU XMM0, params->dBdX[ESI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0x86);
+        addlong(offsetof(voodoo_params_t, dBdX));       
+        addbyte(0x8b); /*MOV EAX, params->dZdX[ESI]*/
+        addbyte(0x86);
+        addlong(offsetof(voodoo_params_t, dZdX));
+        addbyte(0xf3); /*MOVDQU XMM5, params->tmu[0].dSdX[ESI]*/
+        addbyte(0x0f);
+        addbyte(0x6f);
+        addbyte(0xae);
+        addlong(offsetof(voodoo_params_t, tmu[0].dSdX));
+        addbyte(0xf3); /*MOVQ XMM6, params->tmu[0].dWdX[ESI]*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xb6);
+        addlong(offsetof(voodoo_params_t, tmu[0].dWdX));
+
+        if (state->xdir > 0)
+        {
+                addbyte(0x66); /*PADDD XMM1, XMM0*/
+                addbyte(0x0f);
+                addbyte(0xfe);
+                addbyte(0xc8);
+        }
+        else
+        {
+                addbyte(0x66); /*PSUBD XMM1, XMM0*/
+                addbyte(0x0f);
+                addbyte(0xfa);
+                addbyte(0xc8);
+        }
+
+        addbyte(0xf3); /*MOVQ XMM0, state->w*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, w));
+        addbyte(0xf3); /*MOVDQU state->ib, XMM1*/
+        addbyte(0x0f);
+        addbyte(0x7f);
+        addbyte(0x8f);
+        addlong(offsetof(voodoo_state_t, ib));
+        addbyte(0xf3); /*MOVQ XMM7, params->dWdX*/
+        addbyte(0x0f);
+        addbyte(0x7e);
+        addbyte(0xbe);
+        addlong(offsetof(voodoo_params_t, dWdX));
+
+        if (state->xdir > 0)
+        {
+                addbyte(0x66); /*PADDQ XMM3, XMM5*/
+                addbyte(0x0f);
+                addbyte(0xd4);
+                addbyte(0xdd);
+                addbyte(0x66); /*PADDQ XMM4, XMM6*/
+                addbyte(0x0f);
+                addbyte(0xd4);
+                addbyte(0xe6);
+                addbyte(0x66); /*PADDQ XMM0, XMM7*/
+                addbyte(0x0f);
+                addbyte(0xd4);
+                addbyte(0xc7);
+                addbyte(0x01); /*ADD state->z[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, z));        
+        }
+        else
+        {
+                addbyte(0x66); /*PSUBQ XMM3, XMM5*/
+                addbyte(0x0f);
+                addbyte(0xfb);
+                addbyte(0xdd);
+                addbyte(0x66); /*PSUBQ XMM4, XMM6*/
+                addbyte(0x0f);
+                addbyte(0xfb);
+                addbyte(0xe6);
+                addbyte(0x66); /*PSUBQ XMM0, XMM7*/
+                addbyte(0x0f);
+                addbyte(0xfb);
+                addbyte(0xc7);
+                addbyte(0x29); /*SUB state->z[EDI], EAX*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, z));        
+        }
+
+        addbyte(0xf3); /*MOVDQU state->tmu0_s, XMM3*/
+        addbyte(0x0f);
+        addbyte(0x7f);
+        addbyte(0x9f);
+        addlong(offsetof(voodoo_state_t, tmu0_s));
+        addbyte(0x66); /*MOVQ state->tmu0_w, XMM4*/
+        addbyte(0x0f);
+        addbyte(0xd6);
+        addbyte(0xa7);
+        addlong(offsetof(voodoo_state_t, tmu0_w));
+        addbyte(0x66); /*MOVQ state->w, XMM0*/
+        addbyte(0x0f);
+        addbyte(0xd6);
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, w));
+        
+        addbyte(0x83); /*ADD state->pixel_count[EDI], 1*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, pixel_count));
+        addbyte(1);
+
+        if (params->fbzColorPath & FBZCP_TEXTURE_ENABLED)
+        {
+                if ((params->textureMode[0] & TEXTUREMODE_MASK) == TEXTUREMODE_PASSTHROUGH ||
+                    (params->textureMode[0] & TEXTUREMODE_LOCAL_MASK) == TEXTUREMODE_LOCAL)
+                {
+                        addbyte(0x83); /*ADD state->texel_count[EDI], 1*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, texel_count));
+                        addbyte(1);
+                }
+                else
+                {
+                        addbyte(0x83); /*ADD state->texel_count[EDI], 2*/
+                        addbyte(0x87);
+                        addlong(offsetof(voodoo_state_t, texel_count));
+                        addbyte(2);
+                }                
+        }
+        addbyte(0x8b); /*MOV EAX, state->x[EDI]*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, x));
+        
+        if (state->xdir > 0)
+        {
+                addbyte(0x83); /*ADD state->x[EDI], 1*/
+                addbyte(0x87);
+                addlong(offsetof(voodoo_state_t, x));
+                addbyte(1);
+        }
+        else
+        {
+                addbyte(0x83); /*SUB state->x[EDI], 1*/
+                addbyte(0xaf);
+                addlong(offsetof(voodoo_state_t, x));
+                addbyte(1);
+        }
+
+        addbyte(0x3b); /*CMP EAX, state->x2[EDI]*/
+        addbyte(0x87);
+        addlong(offsetof(voodoo_state_t, x2));
+        addbyte(0x0f); /*JNZ loop_jump_pos*/
+        addbyte(0x85);
+        addlong(loop_jump_pos - (block_pos + 4));
+        
+        addbyte(0x5b); /*POP EBX*/        
+        addbyte(0x5e); /*POP ESI*/
+        addbyte(0x5f); /*POP EDI*/
+        addbyte(0x5d); /*POP EBP*/
+        
+        addbyte(0xC3); /*RET*/
+        
+        if (params->textureMode[1] & TEXTUREMODE_TRILINEAR)
+                cs = cs;
+}
+/*Number of times voodoo_get_block() has had to generate a new code block*/
+int voodoo_recomp = 0;
+
+/*
+ * Look up, or generate, the code block for the current render state.
+ *
+ * codegen_data holds BLOCK_NUM cached blocks for each odd_even slot,
+ * interleaved with a stride of 4 (entry index = odd_even + block*4). The
+ * search starts at the block that matched last time for this slot. A block
+ * matches when xdir, alphaMode, fbzMode, fogMode, fbzColorPath, bit 18 of
+ * trexInit1[0], both textureMode values, both LOD_MASK-masked tLOD values and
+ * the tiled flag all equal the values recorded when the block was generated.
+ *
+ * On a miss, voodoo_recomp is incremented, the entry at
+ * next_block_to_write[odd_even] is regenerated with voodoo_generate(), its
+ * key is recorded, and the write position advances round-robin.
+ *
+ * voodoo   - card state; codegen_data must have been set up by
+ *            voodoo_codegen_init()
+ * params   - parameters whose mode registers form the cache key
+ * state    - render state; only xdir is part of the key
+ * odd_even - slot index selecting which interleaved set of blocks to use
+ *
+ * Returns a pointer to the matching (or freshly generated) code_block.
+ */
+static inline void *voodoo_get_block(voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int odd_even)
+{
+        int c;
+        int b = last_block[odd_even];
+        voodoo_x86_data_t *data;
+        /*Explicit cast from void * so this is valid as both C and C++*/
+        voodoo_x86_data_t *codegen_data = (voodoo_x86_data_t *)voodoo->codegen_data;
+        int is_tiled = (params->col_tiled || params->aux_tiled) ? 1 : 0;
+        
+        /*Bound the search by BLOCK_NUM, the same constant that sizes the
+          allocation, rather than by a separate literal*/
+        for (c = 0; c < BLOCK_NUM; c++)
+        {
+                data = &codegen_data[odd_even + b*4];
+                
+                if (state->xdir == data->xdir &&
+                    params->alphaMode == data->alphaMode &&
+                    params->fbzMode == data->fbzMode &&
+                    params->fogMode == data->fogMode &&
+                    params->fbzColorPath == data->fbzColorPath &&
+                    (voodoo->trexInit1[0] & (1 << 18)) == data->trexInit1 &&
+                    params->textureMode[0] == data->textureMode[0] &&
+                    params->textureMode[1] == data->textureMode[1] &&
+                    (params->tLOD[0] & LOD_MASK) == data->tLOD[0] &&
+                    (params->tLOD[1] & LOD_MASK) == data->tLOD[1] &&
+                    is_tiled == data->is_tiled)
+                {
+                        /*Remember the hit so the next lookup starts here*/
+                        last_block[odd_even] = b;
+                        return data->code_block;
+                }
+                
+                b = (b + 1) % BLOCK_NUM;
+        }
+
+        /*Cache miss: overwrite the next entry in round-robin order*/
+        voodoo_recomp++;
+        data = &codegen_data[odd_even + next_block_to_write[odd_even]*4];
+        
+        voodoo_generate(data->code_block, voodoo, params, state, depth_op);
+
+        /*Record the key the block was generated for*/
+        data->xdir = state->xdir;
+        data->alphaMode = params->alphaMode;
+        data->fbzMode = params->fbzMode;
+        data->fogMode = params->fogMode;
+        data->fbzColorPath = params->fbzColorPath;
+        data->trexInit1 = voodoo->trexInit1[0] & (1 << 18);
+        data->textureMode[0] = params->textureMode[0];
+        data->textureMode[1] = params->textureMode[1];
+        data->tLOD[0] = params->tLOD[0] & LOD_MASK;
+        data->tLOD[1] = params->tLOD[1] & LOD_MASK;
+        data->is_tiled = is_tiled;
+
+        next_block_to_write[odd_even] = (next_block_to_write[odd_even] + 1) % BLOCK_NUM;
+        
+        return data->code_block;
+}
+
+/*
+ * Allocate the executable buffer that holds the generated code blocks and
+ * fill the SSE lookup tables used by the generated code.
+ *
+ * The buffer has room for BLOCK_NUM*4 voodoo_x86_data_t entries and is mapped
+ * readable, writable and executable. If the allocation fails, codegen_data is
+ * set to NULL and use_recompiler is cleared.
+ * NOTE(review): this assumes callers test use_recompiler before calling
+ * voodoo_get_block() - confirm against the render code.
+ *
+ * The lookup tables are filled regardless of the allocation result.
+ */
+void voodoo_codegen_init(voodoo_t *voodoo)
+{
+        int c;
+
+#if defined WIN32 || defined _WIN32
+        voodoo->codegen_data = VirtualAlloc(NULL, sizeof(voodoo_x86_data_t) * BLOCK_NUM*4, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+#else
+        /*Anonymous mappings take fd -1 for portability*/
+        voodoo->codegen_data = mmap(0, sizeof(voodoo_x86_data_t) * BLOCK_NUM*4, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0);
+        /*mmap reports failure with MAP_FAILED, not NULL; normalise it*/
+        if (voodoo->codegen_data == MAP_FAILED)
+                voodoo->codegen_data = NULL;
+#endif
+        if (!voodoo->codegen_data)
+                voodoo->use_recompiler = 0;
+
+        for (c = 0; c < 256; c++)
+        {
+                int d[4];
+                int _ds = c & 0xf; /*low nibble of the table index*/
+                int dt = c >> 4;   /*high nibble of the table index*/
+                
+                /*c (resp. 255-c) replicated into the four low 16-bit lanes, upper 64 bits zero*/
+                alookup[c] = _mm_set_epi32(0, 0, c | (c << 16), c | (c << 16));
+                aminuslookup[c] = _mm_set_epi32(0, 0, (255-c) | ((255-c) << 16), (255-c) | ((255-c) << 16));
+
+                /*Four weights derived from the two nibbles; they always sum to 256*/
+                d[0] = (16 - _ds) * (16 - dt);
+                d[1] =  _ds * (16 - dt);
+                d[2] = (16 - _ds) * dt;
+                d[3] = _ds * dt;
+
+                /*Entry c*2 holds d[0] in its low four 16-bit lanes and d[1] in its high four;
+                  entry c*2+1 holds d[2] and d[3] the same way*/
+                bilinear_lookup[c*2]     = _mm_set_epi32(d[1] | (d[1] << 16), d[1] | (d[1] << 16), d[0] | (d[0] << 16), d[0] | (d[0] << 16));
+                bilinear_lookup[c*2 + 1] = _mm_set_epi32(d[3] | (d[3] << 16), d[3] | (d[3] << 16), d[2] | (d[2] << 16), d[2] | (d[2] << 16));
+        }
+        /*Extra entry so that index 256 is also valid*/
+        alookup[256] = _mm_set_epi32(0, 0, 256 | (256 << 16), 256 | (256 << 16));
+        /*Constants: all zero, and 0xff in each of the four low 16-bit lanes*/
+        xmm_00_ff_w[0] = _mm_set_epi32(0, 0, 0, 0);
+        xmm_00_ff_w[1] = _mm_set_epi32(0, 0, 0xff | (0xff << 16), 0xff | (0xff << 16));
+}
+
+/*
+ * Release the code buffer allocated by voodoo_codegen_init().
+ *
+ * Does nothing if no buffer is held. codegen_data is cleared afterwards so
+ * that a repeated call cannot release the same buffer twice.
+ */
+void voodoo_codegen_close(voodoo_t *voodoo)
+{
+        if (!voodoo->codegen_data)
+                return;
+
+#if defined WIN32 || defined _WIN32
+        VirtualFree(voodoo->codegen_data, 0, MEM_RELEASE);
+#else
+        /*A failed mmap leaves MAP_FAILED here, which must not be unmapped*/
+        if (voodoo->codegen_data != MAP_FAILED)
+                munmap(voodoo->codegen_data, sizeof(voodoo_x86_data_t) * BLOCK_NUM*4);
+#endif
+        voodoo->codegen_data = NULL;
+}
diff --git a/pcem/vid_voodoo_common.h b/pcem/vid_voodoo_common.h
new file mode 100644 (file)
index 0000000..df261af
--- /dev/null
@@ -0,0 +1,509 @@
+/*Drop any MIN/CLAMP defined by an earlier header so the versions below are the ones in effect*/
+#ifdef MIN
+#undef MIN
+#endif
+#ifdef CLAMP
+#undef CLAMP
+#endif
+
+/*Smaller of a and b. Arguments are evaluated more than once, so avoid side effects*/
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+/*Clamp x to 0..0xff (CLAMP) or 0..0xffff (CLAMP16). x is evaluated more than once*/
+#define CLAMP(x) (((x) < 0) ? 0 : (((x) > 0xff) ? 0xff : (x)))
+#define CLAMP16(x) (((x) < 0) ? 0 : (((x) > 0xffff) ? 0xffff : (x)))
+
+
+/*Sizes the per-LOD tables in voodoo_params_t, which have LOD_MAX+2 entries per TMU*/
+#define LOD_MAX 8
+
+/*NOTE(review): not used in this header; presumably the shift that maps a texture address to a texture_present[] index - confirm*/
+#define TEX_DIRTY_SHIFT 10
+
+/*Number of texture_cache[] entries per TMU in voodoo_t*/
+#define TEX_CACHE_MAX 64
+
+/*Card models, numbered from 0 in declaration order. Presumably the values stored in voodoo_t.type - confirm*/
+enum
+{
+        VOODOO_1 = 0,
+        VOODOO_SB50,
+        VOODOO_2,
+        VOODOO_BANSHEE,
+        VOODOO_3
+};
+
+/*Overlays a 32-bit integer and a float so the same bits can be accessed as either*/
+typedef union int_float
+{
+        uint32_t i;
+        float f;
+} int_float;
+
+/*8-bit colour stored in b, g, r byte order, with an explicit pad byte making the struct 4 bytes*/
+typedef struct rgb_t
+{
+        uint8_t b, g, r;
+        uint8_t pad;
+} rgb_t;
+/*8-bit-per-channel colour with alpha, stored in b, g, r, a byte order*/
+typedef struct rgba8_t
+{
+        uint8_t b, g, r, a;
+} rgba8_t;
+
+/*Colour accessible either per channel (b, g, r, a bytes) or as one 32-bit word*/
+typedef union rgba_u
+{
+        struct
+        {
+                uint8_t b, g, r, a;
+        } rgba;
+        uint32_t u;
+} rgba_u;
+
+/*Write FIFO dimensions. voodoo_t.fifo[] has FIFO_SIZE entries; FIFO_MASK is FIFO_SIZE-1*/
+#define FIFO_SIZE 65536
+#define FIFO_MASK (FIFO_SIZE - 1)
+/*Bit 31, written as an unsigned constant because 1 << 31 overflows a 32-bit signed int*/
+#define FIFO_ENTRY_SIZE (1u << 31)
+
+/*Fill level tests. These expand to accesses on a variable named voodoo, which
+  must be in scope. FIFO_FULL leaves a margin of 4 entries below FIFO_SIZE.
+  NOTE(review): the indices are presumably free-running counters reduced with
+  FIFO_MASK when indexing fifo[] - confirm against the FIFO code*/
+#define FIFO_ENTRIES (voodoo->fifo_write_idx - voodoo->fifo_read_idx)
+#define FIFO_FULL    ((voodoo->fifo_write_idx - voodoo->fifo_read_idx) >= FIFO_SIZE-4)
+#define FIFO_EMPTY   (voodoo->fifo_read_idx == voodoo->fifo_write_idx)
+
+/*Masks for the top byte and the low 24 bits; the FIFO_* entry type values occupy the top byte*/
+#define FIFO_TYPE 0xff000000
+#define FIFO_ADDR 0x00ffffff
+
+/*FIFO entry types. Each value sits in bits 24-31, the bits selected by FIFO_TYPE*/
+enum
+{
+        FIFO_INVALID      = (0x00 << 24),
+        FIFO_WRITEL_REG   = (0x01 << 24),
+        FIFO_WRITEW_FB    = (0x02 << 24),
+        FIFO_WRITEL_FB    = (0x03 << 24),
+        FIFO_WRITEL_TEX   = (0x04 << 24),
+        FIFO_WRITEL_2DREG = (0x05 << 24)
+};
+
+/*Parameter ring dimensions. voodoo_t.params_buffer[] has PARAM_SIZE entries; PARAM_MASK is PARAM_SIZE-1*/
+#define PARAM_SIZE 1024
+#define PARAM_MASK (PARAM_SIZE - 1)
+/*Bit 31, written as an unsigned constant because 1 << 31 overflows a 32-bit signed int*/
+#define PARAM_ENTRY_SIZE (1u << 31)
+
+/*Fill level tests for reader x (an index into params_read_idx[]). These expand
+  to accesses on a variable named voodoo, which must be in scope*/
+#define PARAM_ENTRIES(x) (voodoo->params_write_idx - voodoo->params_read_idx[x])
+#define PARAM_FULL(x)    ((voodoo->params_write_idx - voodoo->params_read_idx[x]) >= PARAM_SIZE)
+#define PARAM_EMPTY(x)   (voodoo->params_read_idx[x] == voodoo->params_write_idx)
+
+/*One element of voodoo_t.fifo[]*/
+typedef struct
+{
+        /*Presumably a FIFO_* type in the FIFO_TYPE bits combined with an address in the FIFO_ADDR bits - confirm*/
+        uint32_t addr_type;
+        uint32_t val;
+} fifo_entry_t;
+
+/*
+ * Rendering parameter block. voodoo_t holds one live copy (params) and a ring
+ * of PARAM_SIZE copies (params_buffer).
+ *
+ * The x86 code generator addresses fields of this struct through offsetof()
+ * and uses wide loads: a 16-byte load starting at dBdX, 16-byte loads at
+ * tmu[n].dSdX, 8-byte loads at dWdX and tmu[n].dWdX, and a 32-bit load of
+ * dZdX. The order and sizes of those fields must therefore not change.
+ */
+typedef struct voodoo_params_t
+{
+        int command;
+
+        int32_t vertexAx, vertexAy, vertexBx, vertexBy, vertexCx, vertexCy;
+
+        uint32_t startR, startG, startB, startZ, startA;
+
+        /*Per-X steps. dBdX..dAdX are loaded together as one 128-bit vector by the generated code*/
+         int32_t dBdX, dGdX, dRdX, dAdX, dZdX;
+
+        /*Per-Y steps, declared in the same order as the per-X steps*/
+         int32_t dBdY, dGdY, dRdY, dAdY, dZdY;
+
+        int64_t startW, dWdX, dWdY;
+
+        /*Per-TMU S/T/W start values and steps. NOTE(review): p1..p3 are presumably padding that keeps each row at four 64-bit values - confirm*/
+        struct
+        {
+                int64_t startS, startT, startW, p1;
+                int64_t dSdX, dTdX, dWdX, p2;
+                int64_t dSdY, dTdY, dWdY, p3;
+        } tmu[2];
+
+        uint32_t color0, color1;
+
+        /*fbzMode, fbzColorPath, fogMode, alphaMode, textureMode[] and tLOD[] are part of the code block cache key*/
+        uint32_t fbzMode;
+        uint32_t fbzColorPath;
+
+        uint32_t fogMode;
+        rgb_t fogColor;
+        struct
+        {
+                uint8_t fog, dfog;
+        } fogTable[64];
+
+        uint32_t alphaMode;
+
+        uint32_t zaColor;
+
+        int chromaKey_r, chromaKey_g, chromaKey_b;
+        uint32_t chromaKey;
+
+        uint32_t textureMode[2];
+        uint32_t tLOD[2];
+
+        uint32_t texBaseAddr[2], texBaseAddr1[2], texBaseAddr2[2], texBaseAddr38[2];
+
+        /*Per-TMU tables with LOD_MAX+2 entries each*/
+        uint32_t tex_base[2][LOD_MAX+2];
+        uint32_t tex_end[2][LOD_MAX+2];
+        int tex_width[2];
+        int tex_w_mask[2][LOD_MAX+2];
+        int tex_w_nmask[2][LOD_MAX+2];
+        int tex_h_mask[2][LOD_MAX+2];
+        int tex_shift[2][LOD_MAX+2];
+        int tex_lod[2][LOD_MAX+2];
+        int tex_entry[2];
+        int detail_max[2], detail_bias[2], detail_scale[2];
+
+        uint32_t draw_offset, aux_offset;
+
+        int tformat[2];
+
+        int clipLeft, clipRight, clipLowY, clipHighY;
+        int clipLeft1, clipRight1, clipLowY1, clipHighY1;
+
+        int sign;
+
+        uint32_t front_offset;
+
+        uint32_t swapbufferCMD;
+
+        uint32_t stipple;
+
+        /*When set, the generated code indexes the colour / aux buffer with state->x_tiled instead of state->x*/
+        int col_tiled, aux_tiled;
+        int row_width, aux_row_width;
+} voodoo_params_t;
+
+/*One entry of voodoo_t.texture_cache[][]*/
+typedef struct texture_t
+{
+        uint32_t base;
+        uint32_t tLOD;
+        /*volatile: NOTE(review): presumably updated from more than one thread, with refcount_r[] one counter per render thread - confirm*/
+        volatile int refcount, refcount_r[4];
+        int is16;
+        uint32_t palette_checksum;
+        uint32_t addr_start[4], addr_end[4];
+        uint32_t *data;
+} texture_t;
+
+/*Floating-point vertex record, the element type of voodoo_t.verts[].
+  NOTE(review): the s* names presumably mirror the hardware's setup-engine vertex registers - confirm*/
+typedef struct vert_t
+{
+        float sVx, sVy;
+        float sRed, sGreen, sBlue, sAlpha;
+        float sVz, sWb;
+        float sW0, sS0, sT0;
+        float sW1, sS1, sT1;
+} vert_t;
+
+/*Clip rectangle bounds; voodoo_t.banshee_blt keeps two of these in clip[]*/
+typedef struct clip_t
+{
+        int x_min, x_max;
+        int y_min, y_max;
+} clip_t;
+
+/*
+ * State of one emulated card. voodoo_set_t groups up to two of these and each
+ * card points back to its set through the set member.
+ */
+typedef struct voodoo_t
+{
+        mem_mapping_t mapping;
+
+        int pci_enable;
+
+        uint8_t dac_data[8];
+        int dac_reg, dac_reg_ff;
+        uint8_t dac_readdata;
+        uint16_t dac_pll_regs[16];
+
+        float pixel_clock;
+        uint64_t line_time;
+
+        /*Live parameter block; params_buffer[] below holds queued copies*/
+        voodoo_params_t params;
+
+        uint32_t fbiInit0, fbiInit1, fbiInit2, fbiInit3, fbiInit4;
+        uint32_t fbiInit5, fbiInit6, fbiInit7; /*Voodoo 2*/
+
+        uint32_t initEnable;
+
+        uint32_t lfbMode;
+
+        uint32_t memBaseAddr;
+
+        int_float fvertexAx, fvertexAy, fvertexBx, fvertexBy, fvertexCx, fvertexCy;
+
+        uint32_t front_offset, back_offset;
+
+        uint32_t fb_read_offset, fb_write_offset;
+
+        int row_width, aux_row_width;
+        int block_width;
+        
+        int col_tiled, aux_tiled;
+
+        uint8_t *fb_mem, *tex_mem[2];
+        uint16_t *tex_mem_w[2];
+
+        int rgb_sel;
+
+        /*Bit 18 of trexInit1[0] is part of the code block cache key*/
+        uint32_t trexInit1[2];
+
+        uint32_t tmuConfig;
+
+        mutex_t *swap_mutex;
+        int swap_count;
+
+        int disp_buffer, draw_buffer;
+        pc_timer_t timer;
+
+        int line;
+        svga_t *svga;
+
+        uint32_t backPorch;
+        uint32_t videoDimensions;
+        uint32_t hSync, vSync;
+
+        int h_total, v_total, v_disp;
+        int h_disp;
+        int v_retrace;
+
+        /*NCC tables indexed [tmu][table]; voodoo_update_ncc() reads these to fill ncc_lookup*/
+        struct
+        {
+                uint32_t y[4], i[4], q[4];
+        } nccTable[2][2];
+
+        rgba_u palette[2][256];
+
+        rgba_u ncc_lookup[2][2][256];
+        int ncc_dirty[2];
+
+        /*Threads and events. NOTE(review): the arrays of 4 are presumably indexed by render thread - confirm*/
+        thread_t *fifo_thread;
+        thread_t *render_thread[4];
+        event_t *wake_fifo_thread;
+        event_t *wake_main_thread;
+        event_t *fifo_not_full_event;
+        event_t *render_not_full_event[4];
+        event_t *wake_render_thread[4];
+
+        int voodoo_busy;
+        int render_voodoo_busy[4];
+
+        int render_threads;
+        int odd_even_mask;
+
+        int pixel_count[4], texel_count[4], tri_count, frame_count;
+        int pixel_count_old[4], texel_count_old[4];
+        int wr_count, rd_count, tex_count;
+
+        int retrace_count;
+        int swap_interval;
+        uint32_t swap_offset;
+        int swap_pending;
+
+        int bilinear_enabled;
+
+        int fb_size;
+        uint32_t fb_mask;
+
+        int texture_size;
+        uint32_t texture_mask;
+
+        /*When non-zero the generated code also steps the tmu1_s / tmu1_w state values*/
+        int dual_tmus;
+        int type;
+
+        /*Write FIFO ring; the FIFO_ENTRIES / FIFO_FULL / FIFO_EMPTY macros compare these two indices*/
+        fifo_entry_t fifo[FIFO_SIZE];
+        volatile int fifo_read_idx, fifo_write_idx;
+        volatile int cmd_read, cmd_written, cmd_written_fifo;
+
+        /*Parameter ring; the PARAM_* macros compare params_write_idx with one params_read_idx[] element*/
+        voodoo_params_t params_buffer[PARAM_SIZE];
+        volatile int params_read_idx[4], params_write_idx;
+
+        uint32_t cmdfifo_base, cmdfifo_end, cmdfifo_size;
+        int cmdfifo_rp, cmdfifo_ret_addr;
+        int cmdfifo_in_sub;
+        volatile int cmdfifo_depth_rd, cmdfifo_depth_wr;
+        volatile int cmdfifo_enabled;
+        uint32_t cmdfifo_amin, cmdfifo_amax;
+        int cmdfifo_holecount;
+
+        uint32_t sSetupMode;
+        vert_t verts[4];
+        unsigned int vertex_ages[3];
+        unsigned int vertex_next_age;
+        int num_verticies;
+        int cull_pingpong;
+
+        int flush;
+
+        int scrfilter;
+       int scrfilterEnabled;
+       int scrfilterThreshold;
+       int scrfilterThresholdOld;
+
+        uint32_t last_write_addr;
+
+        uint32_t fbiPixelsIn;
+        uint32_t fbiChromaFail;
+        uint32_t fbiZFuncFail;
+        uint32_t fbiAFuncFail;
+        uint32_t fbiPixelsOut;
+
+        uint32_t bltSrcBaseAddr;
+        uint32_t bltDstBaseAddr;
+        int bltSrcXYStride, bltDstXYStride;
+        uint32_t bltSrcChromaRange, bltDstChromaRange;
+        int bltSrcChromaMinR, bltSrcChromaMinG, bltSrcChromaMinB;
+        int bltSrcChromaMaxR, bltSrcChromaMaxG, bltSrcChromaMaxB;
+        int bltDstChromaMinR, bltDstChromaMinG, bltDstChromaMinB;
+        int bltDstChromaMaxR, bltDstChromaMaxG, bltDstChromaMaxB;
+
+        int bltClipRight, bltClipLeft;
+        int bltClipHighY, bltClipLowY;
+
+        int bltSrcX, bltSrcY;
+        int bltDstX, bltDstY;
+        int bltSizeX, bltSizeY;
+        int bltRop[4];
+        uint16_t bltColorFg, bltColorBg;
+
+        uint32_t bltCommand;
+
+        uint32_t leftOverlayBuf;
+        
+        /*Working state kept alongside the blt* fields above*/
+        struct
+        {
+                int dst_x, dst_y;
+                int cur_x;
+                int size_x, size_y;
+                int x_dir, y_dir;
+                int dst_stride;
+        } blt;
+
+        /*2D blitter state: register-named values first, then working values*/
+        struct
+        {
+                uint32_t bresError0, bresError1;
+                uint32_t clip0Min, clip0Max;
+                uint32_t clip1Min, clip1Max;
+                uint32_t colorBack, colorFore;
+                uint32_t command, commandExtra;
+                uint32_t dstBaseAddr;
+                uint32_t dstFormat;
+                uint32_t dstSize;
+                uint32_t dstXY;
+                uint32_t lineStipple;
+                uint32_t lineStyle;
+                uint32_t rop;
+                uint32_t srcBaseAddr;
+                uint32_t srcFormat;
+                uint32_t srcSize;
+                uint32_t srcXY;
+                
+                uint32_t colorPattern[64];
+
+                int bres_error_0, bres_error_1;
+                uint32_t colorPattern8[64], colorPattern16[64], colorPattern24[64];
+                int cur_x, cur_y;
+                uint32_t dstBaseAddr_tiled;
+                uint32_t dstColorkeyMin, dstColorkeyMax;
+                int dstSizeX, dstSizeY;
+                int dstX, dstY;
+                int dst_stride;
+                int patoff_x, patoff_y;
+                uint8_t rops[4];
+                uint32_t srcBaseAddr_tiled;
+                uint32_t srcColorkeyMin, srcColorkeyMax;
+                int srcSizeX, srcSizeY;
+                int srcX, srcY;
+                int src_stride;
+                int old_srcX;
+                
+                /*Used for handling packed 24bpp host data*/
+                int host_data_remainder;
+                uint32_t old_host_data;
+                
+                /*Polyfill coordinates*/
+                int lx[2], rx[2];
+                int ly[2], ry[2];
+
+                /*Polyfill state*/
+                int error[2];
+                int dx[2], dy[2];
+                int x_inc[2]; /*y_inc is always 1 for polyfill*/
+                int lx_cur, rx_cur;
+
+                clip_t clip[2];
+                
+                uint8_t host_data[16384];
+                int host_data_count;
+                int host_data_size_src, host_data_size_dest;
+                int src_stride_src, src_stride_dest;
+
+                int src_bpp;
+
+                int line_pix_pos, line_bit_pos;
+                int line_rep_cnt, line_bit_mask_size;
+        } banshee_blt;
+        
+        /*Video overlay state: register-named values first, then derived coordinates and sizes*/
+        struct
+        {
+                uint32_t vidOverlayStartCoords;
+                uint32_t vidOverlayEndScreenCoords;
+                uint32_t vidOverlayDudx, vidOverlayDudxOffsetSrcWidth;
+                uint32_t vidOverlayDvdy, vidOverlayDvdyOffset;
+                //uint32_t vidDesktopOverlayStride;
+                
+                int start_x, start_y;
+                int end_x, end_y;
+                int size_x, size_y;
+                int overlay_bytes;
+                
+                unsigned int src_y;
+        } overlay;
+
+        rgb_t clutData[33];
+        int clutData_dirty;
+        rgb_t clutData256[256];
+        uint32_t video_16to32[0x10000];
+
+        uint8_t dirty_line[2048];
+        int dirty_line_low, dirty_line_high;
+
+        int fb_write_buffer, fb_draw_buffer;
+        int buffer_cutoff;
+        
+        uint32_t tile_base, tile_stride;
+        int tile_stride_shift, tile_x, tile_x_real;
+
+        int read_time, write_time, burst_time;
+
+        pc_timer_t wake_timer;
+
+        /* screen filter tables */
+        uint8_t thefilter[256][256];
+        uint8_t thefilterg[256][256];
+        uint8_t thefilterb[256][256];
+        uint16_t purpleline[256][3];
+
+        /*Texture cache, TEX_CACHE_MAX entries per TMU*/
+        texture_t texture_cache[2][TEX_CACHE_MAX];
+        uint8_t texture_present[2][16384];
+        int texture_last_removed;
+
+        uint32_t palette_checksum[2];
+        int palette_dirty[2];
+
+        uint64_t time;
+        int render_time[4];
+
+        /*codegen_data is the executable buffer allocated by voodoo_codegen_init() and
+          released by voodoo_codegen_close(); it holds the generated code blocks*/
+        int use_recompiler;
+        void *codegen_data;
+
+        /*Set this card belongs to*/
+        struct voodoo_set_t *set;
+        
+        
+        uint8_t *vram, *changedvram;
+        
+        void *p;
+} voodoo_t;
+
+/*Group of up to two cards. NOTE(review): nr_cards presumably counts the valid voodoos[] entries - confirm*/
+typedef struct voodoo_set_t
+{
+        voodoo_t *voodoos[2];
+
+        mem_mapping_t snoop_mapping;
+
+        int nr_cards;
+} voodoo_set_t;
+
+
+/*Format conversion tables defined elsewhere: 0x100 entries for the 8-bit formats, 0x10000 for the 16-bit ones.
+  NOTE(review): presumably indexed by the raw source pixel value - confirm*/
+extern rgba8_t rgb332[0x100], ai44[0x100], rgb565[0x10000], argb1555[0x10000], argb4444[0x10000], ai88[0x10000];
+
+
+void voodoo_generate_vb_filters(voodoo_t *voodoo, int fcr, int fcg);
+
+void voodoo_recalc(voodoo_t *voodoo);
+/*Rebuilds voodoo->ncc_lookup[tmu] from voodoo->nccTable[tmu]*/
+void voodoo_update_ncc(voodoo_t *voodoo, int tmu);
+
+void *voodoo_2d3d_card_init(int type);
+void voodoo_card_close(voodoo_t *voodoo);
diff --git a/pcem/vid_voodoo_display.cpp b/pcem/vid_voodoo_display.cpp
new file mode 100644 (file)
index 0000000..9ce92dd
--- /dev/null
@@ -0,0 +1,609 @@
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_display.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+
+/* Pull the 9-bit field starting at bit 'shift' out of 'packed' and
+   sign-extend it, treating bit 8 of the field as the sign bit. */
+static int voodoo_ncc_signed9(uint32_t packed, int shift)
+{
+        int field = (packed >> shift) & 0x1ff;
+
+        if (field & 0x100)
+                field |= 0xfffffe00;
+        return field;
+}
+
+/* Rebuild ncc_lookup[tmu][table][0..255] for both tables of one TMU from
+   the nccTable registers.
+
+   An 8-bit index is split into a 4-bit Y selector (bits 7-4), a 2-bit I
+   selector (bits 3-2) and a 2-bit Q selector (bits 1-0).  Y entries are
+   unsigned bytes packed four to a word; I and Q words each hold three signed
+   9-bit components (red at bit 18, green at bit 9, blue at bit 0).  Each
+   output channel is CLAMP(y + i + q) and alpha is forced to 0xff. */
+void voodoo_update_ncc(voodoo_t *voodoo, int tmu)
+{
+        int table;
+
+        for (table = 0; table < 2; table++)
+        {
+                int index;
+
+                for (index = 0; index < 256; index++)
+                {
+                        int y_sel = index >> 4;
+                        int i_sel = (index >> 2) & 3;
+                        int q_sel = index & 3;
+                        int luma;
+                        int i_red, i_green, i_blue;
+                        int q_red, q_green, q_blue;
+
+                        /* four 8-bit Y values per register word */
+                        luma = (voodoo->nccTable[tmu][table].y[y_sel >> 2] >> ((y_sel & 3) * 8)) & 0xff;
+
+                        i_red   = voodoo_ncc_signed9(voodoo->nccTable[tmu][table].i[i_sel], 18);
+                        i_green = voodoo_ncc_signed9(voodoo->nccTable[tmu][table].i[i_sel], 9);
+                        i_blue  = voodoo_ncc_signed9(voodoo->nccTable[tmu][table].i[i_sel], 0);
+
+                        q_red   = voodoo_ncc_signed9(voodoo->nccTable[tmu][table].q[q_sel], 18);
+                        q_green = voodoo_ncc_signed9(voodoo->nccTable[tmu][table].q[q_sel], 9);
+                        q_blue  = voodoo_ncc_signed9(voodoo->nccTable[tmu][table].q[q_sel], 0);
+
+                        voodoo->ncc_lookup[tmu][table][index].rgba.r = CLAMP(luma + i_red + q_red);
+                        voodoo->ncc_lookup[tmu][table][index].rgba.g = CLAMP(luma + i_green + q_green);
+                        voodoo->ncc_lookup[tmu][table][index].rgba.b = CLAMP(luma + i_blue + q_blue);
+                        voodoo->ncc_lookup[tmu][table][index].rgba.a = 0xff;
+                }
+        }
+}
+
+/* Recompute voodoo->pixel_clock and voodoo->line_time.
+
+   The clock is derived from the M / N1 / N2 fields of dac_pll_regs[0] and is
+   halved for three values of the high nibble of dac_data[6].  line_time is
+   the line length taken from hSync multiplied by cpuclock / pixel clock and
+   scaled by 2^32. */
+void voodoo_pixelclock_update(voodoo_t *voodoo)
+{
+        int pll_m  =  (voodoo->dac_pll_regs[0] & 0x7f) + 2;
+        int pll_n1 = ((voodoo->dac_pll_regs[0] >>  8) & 0x1f) + 2;
+        int pll_n2 = ((voodoo->dac_pll_regs[0] >> 13) & 0x07);
+        /* 14318184.0 is presumably the PLL reference frequency in Hz - confirm */
+        float pixel_hz = (14318184.0 * ((float)pll_m / (float)pll_n1)) / (float)(1 << pll_n2);
+        int pixels_per_line;
+        double clock_ratio;
+
+        switch (voodoo->dac_data[6] & 0xf0)
+        {
+                case 0x20:
+                case 0x60:
+                case 0x70:
+                pixel_hz /= 2.0f;
+                break;
+        }
+
+        /* low byte of hSync plus the 10-bit field at bit 16 */
+        pixels_per_line = (voodoo->hSync & 0xff) + ((voodoo->hSync >> 16) & 0x3ff);
+
+        voodoo->pixel_clock = pixel_hz;
+
+        clock_ratio = cpuclock / pixel_hz;
+        voodoo->line_time = (uint64_t)((double)pixels_per_line * clock_ratio * (double)(1ull << 32));
+}
+
+/* Expand the 33-entry clutData table into the 256-entry clutData256 table
+   by linear interpolation (8 steps between neighbouring entries), then
+   rebuild video_16to32[], which maps every 16-bit 5:6:5 pixel to a 0xRRGGBB
+   value through clutData256. */
+static void voodoo_calc_clutData(voodoo_t *voodoo)
+{
+        int idx;
+
+        for (idx = 0; idx < 256; idx++)
+        {
+                const rgb_t *lo = &voodoo->clutData[idx >> 3];
+                const rgb_t *hi = &voodoo->clutData[(idx >> 3) + 1];
+                int w_hi = idx & 7;       /* weight of the upper entry, 0..7 */
+                int w_lo = 8 - w_hi;      /* weight of the lower entry, 8..1 */
+
+                voodoo->clutData256[idx].r = (lo->r*w_lo + hi->r*w_hi) >> 3;
+                voodoo->clutData256[idx].g = (lo->g*w_lo + hi->g*w_hi) >> 3;
+                voodoo->clutData256[idx].b = (lo->b*w_lo + hi->b*w_hi) >> 3;
+        }
+
+        for (idx = 0; idx < 65536; idx++)
+        {
+                /* Widen each field to 8 bits by shifting only; the low bits
+                   stay zero (no replication of the high bits). */
+                int red8   = (idx >> 8) & 0xf8;
+                int green8 = (idx >> 3) & 0xfc;
+                int blue8  = (idx << 3) & 0xf8;
+
+                voodoo->video_16to32[idx] = (voodoo->clutData256[red8].r << 16) |
+                                            (voodoo->clutData256[green8].g << 8) |
+                                            voodoo->clutData256[blue8].b;
+        }
+}
+
+
+
+/* Number of levels per colour channel iterated over when building the v1
+   filter tables. */
+#define FILTDIV 256
+
+/* Colour filter threshold values for red, green and blue, written by
+   voodoo_threshold_check().  Only FILTCAPB carries an explicit initialiser,
+   but all three have static storage duration and therefore start at 0. */
+static int FILTCAP, FILTCAPG, FILTCAPB = 0;    /* color filter threshold values */
+
+/* Compute one channel of one v1 filter table entry.
+
+   target / neighbour: the two channel values (0..FILTDIV-1).
+   cap:   the neighbour's difference from target is clamped to +/-cap.
+   limit: when the clamped difference is below this, target moves half
+          of the difference toward neighbour.
+   Returns the result clamped to 0..FILTDIV-1. */
+static float voodoo_filter_v1_blend(int target, int neighbour, int cap, float limit)
+{
+        float delta = (float)(neighbour - target);
+        float result = target;
+
+        if (delta > cap)
+                delta = cap;
+        if (delta < -cap)
+                delta = -cap;
+
+        /* Both operands of the || express the same predicate (delta < limit). */
+        if ((delta < limit) || (-delta > -limit))
+                result = target + (delta / 2);  /* halve so dithering is actually smoothed out */
+
+        if (result < 0)
+                result = 0;
+        if (result > FILTDIV-1)
+                result = FILTDIV-1;
+
+        return result;
+}
+
+/* Fill thefilter / thefilterg / thefilterb (indexed [pixel 1][pixel 2]) and
+   the purpleline table from the current FILTCAP, FILTCAPG and FILTCAPB
+   thresholds. */
+void voodoo_generate_filter_v1(voodoo_t *voodoo)
+{
+        int target, neighbour;
+        float limit_r = FILTCAP * 5;
+        float limit_g = FILTCAPG * 6;
+        float limit_b = FILTCAPB * 5;
+
+        for (target = 0; target < FILTDIV; target++)
+        {
+                int boosted;
+
+                for (neighbour = 0; neighbour < FILTDIV; neighbour++)
+                {
+                        voodoo->thefilter[target][neighbour]  = voodoo_filter_v1_blend(target, neighbour, FILTCAP,  limit_r);
+                        voodoo->thefilterg[target][neighbour] = voodoo_filter_v1_blend(target, neighbour, FILTCAPG, limit_g);
+                        voodoo->thefilterb[target][neighbour] = voodoo_filter_v1_blend(target, neighbour, FILTCAPB, limit_b);
+                }
+
+                /* purpleline entries 0 and 2 get +4 (saturating at 255);
+                   entry 1 is left at the input value. */
+                boosted = target + 4;
+                if (boosted > 255)
+                        boosted = 255;
+                voodoo->purpleline[target][0] = boosted;
+                voodoo->purpleline[target][2] = boosted;
+                voodoo->purpleline[target][1] = target;
+        }
+}
+
+/* Compute one channel of one v2 filter table entry.
+
+   target:      value of the pixel being bled into.
+   source:      value of the pixel bleeding into it.
+   cap:         raw threshold for this channel (FILTCAP / FILTCAPG / FILTCAPB).
+   bleed_limit: the same threshold pre-clamped to at most 32.
+   avgdiff:     absolute difference of the two weighted averages.
+   difference:  absolute difference of target and source.
+   Returns the result clamped to 0..255. */
+static float voodoo_filter_v2_blend(int target, int source, int cap, float bleed_limit,
+                                    float avgdiff, float difference)
+{
+        float result = target;
+
+        /* only ever lighten the target */
+        if (source > target)
+        {
+                float bleed = avgdiff;
+
+                if (bleed > bleed_limit)
+                        bleed = bleed_limit;
+
+                result = target + bleed;
+
+                if (result > target + cap)
+                        result = target + cap;
+                if (result > target + avgdiff)
+                        result = target + avgdiff;
+        }
+
+        /* pixels further apart than the threshold are left untouched */
+        if (difference > cap)
+                result = target;
+
+        if (result < 0)
+                result = 0;
+        if (result > 255)
+                result = 255;
+
+        return result;
+}
+
+/* Fill thefilter / thefilterg / thefilterb for the Voodoo 2 style filter
+   from the current FILTCAP, FILTCAPG and FILTCAPB thresholds.  The first
+   table index is the target pixel, the second the pixel bleeding into it. */
+void voodoo_generate_filter_v2(voodoo_t *voodoo)
+{
+        int target, source;
+        float limit_r = FILTCAP;
+        float limit_g = FILTCAPG;
+        float limit_b = FILTCAPB;
+
+        /* pre-clamping */
+        if (limit_r > 32) limit_r = 32;
+        if (limit_g > 32) limit_g = 32;
+        if (limit_b > 32) limit_b = 32;
+
+        for (target = 0; target < 256; target++)
+        {
+                for (source = 0; source < 256; source++)
+                {
+                        /* integer division is intentional here: both averages are truncated */
+                        float near_avg = (float)((4 * target + source) / 5);
+                        float far_avg = (float)((target + 4 * source) / 5);
+                        float avgdiff = near_avg - far_avg;
+                        float difference = (float)(target - source);
+
+                        if (avgdiff < 0)
+                                avgdiff = -avgdiff;
+                        if (difference < 0)
+                                difference = -difference;
+
+                        voodoo->thefilter[target][source]  = voodoo_filter_v2_blend(target, source, FILTCAP,  limit_r, avgdiff, difference);
+                        voodoo->thefilterg[target][source] = voodoo_filter_v2_blend(target, source, FILTCAPG, limit_g, avgdiff, difference);
+                        voodoo->thefilterb[target][source] = voodoo_filter_v2_blend(target, source, FILTCAPB, limit_b, avgdiff, difference);
+                }
+        }
+}
+
+/* Regenerate the screen filter tables when the filter is enabled and
+   scrfilterThreshold differs from the value the tables were last built for.
+   The threshold is unpacked as 0xRRGGBB into FILTCAP / FILTCAPG / FILTCAPB. */
+void voodoo_threshold_check(voodoo_t *voodoo)
+{
+        int red, green, blue;
+
+        /* filter considered disabled: nothing to check or generate */
+        if (!voodoo->scrfilterEnabled)
+                return;
+
+        /* tables are already up to date */
+        if (voodoo->scrfilterThreshold == voodoo->scrfilterThresholdOld)
+                return;
+
+        red   = (voodoo->scrfilterThreshold >> 16) & 0xFF;
+        green = (voodoo->scrfilterThreshold >> 8 ) & 0xFF;
+        blue  = voodoo->scrfilterThreshold & 0xFF;
+
+        FILTCAP = red;
+        FILTCAPG = green;
+        FILTCAPB = blue;
+
+        pclog("Voodoo Filter Threshold Check: %06x - RED %i GREEN %i BLUE %i\n", voodoo->scrfilterThreshold, red, green, blue);
+
+        voodoo->scrfilterThresholdOld = voodoo->scrfilterThreshold;
+
+        if (voodoo->type == VOODOO_2)
+                voodoo_generate_filter_v2(voodoo);
+        else
+                voodoo_generate_filter_v1(voodoo);
+
+        /* Banshee and later additionally get their own tables */
+        if (voodoo->type >= VOODOO_BANSHEE)
+                voodoo_generate_vb_filters(voodoo, FILTCAP, FILTCAPG);
+}
+
+/* One filter pass over pixels [first, last): every channel of out[px] is
+   looked up from the matching channel of in[px] and of in[px + step].
+   'out' and 'in' must be different buffers. */
+static void voodoo_filterline_v1_pass(voodoo_t *voodoo, uint8_t *out, const uint8_t *in,
+                                      int first, int last, int step)
+{
+        int px;
+
+        for (px = first; px < last; px++)
+        {
+                const uint8_t *self = &in[px*3];
+                const uint8_t *other = &in[(px+step)*3];
+
+                out[px*3]   = voodoo->thefilterb[self[0]][other[0]];
+                out[px*3+1] = voodoo->thefilterg[self[1]][other[1]];
+                out[px*3+2] = voodoo->thefilter[self[2]][other[2]];
+        }
+}
+
+/* Filter one scanline with the v1 tables.
+
+   fil:    output, 'column' pixels of interleaved 8-bit B, G, R.
+   column: number of pixels to process.
+   src:    input scanline of 16-bit 5:6:5 pixels.
+   line:   scanline number; odd lines are passed through purpleline first. */
+static void voodoo_filterline_v1(voodoo_t *voodoo, uint8_t *fil, int column, uint16_t *src, int line)
+{
+        int px;
+
+        /* Scratchpad so a pass never reads values it has just written
+           (avoids feedback streaks) */
+        uint8_t scratch[(voodoo->h_disp) * 3];
+
+        /* expand 5:6:5 to three bytes, into both buffers */
+        for (px = 0; px < column; px++)
+        {
+                uint16_t pixel = src[px];
+
+                scratch[px*3+0] = fil[px*3+0] = ((pixel & 31) << 3);
+                scratch[px*3+1] = fil[px*3+1] = (((pixel >> 5) & 63) << 2);
+                scratch[px*3+2] = fil[px*3+2] = (((pixel >> 11) & 31) << 3);
+        }
+
+        /* odd lines: apply the purpleline table to fil only */
+        if (line & 1)
+        {
+                for (px = 0; px < column; px++)
+                {
+                        fil[px*3]   = voodoo->purpleline[fil[px*3]][0];
+                        fil[px*3+1] = voodoo->purpleline[fil[px*3+1]][1];
+                        fil[px*3+2] = voodoo->purpleline[fil[px*3+2]][2];
+                }
+        }
+
+        /* three passes against the left neighbour, ping-ponging between the
+           two buffers, then one pass against the right neighbour */
+        voodoo_filterline_v1_pass(voodoo, scratch, fil,     1, column,     -1);
+        voodoo_filterline_v1_pass(voodoo, fil,     scratch, 1, column,     -1);
+        voodoo_filterline_v1_pass(voodoo, scratch, fil,     1, column,     -1);
+        voodoo_filterline_v1_pass(voodoo, fil,     scratch, 0, column - 1,  1);
+}
+
+
+/* Filter one scanline with the v2 tables.
+
+   fil:    output, 'column' pixels of interleaved 8-bit B, G, R.
+   column: number of pixels to process.
+   src:    input scanline of 16-bit 5:6:5 pixels.
+   line:   unused by this function.
+
+   NOTE(review): the edge-case section indexes fil3[(column-3)*3] and friends
+   unconditionally, so 'column' must be at least 3 or the indices go negative. */
+static void voodoo_filterline_v2(voodoo_t *voodoo, uint8_t *fil, int column, uint16_t *src, int line)
+{
+       int x;
+
+       // Scratchpad for blending filter
+       // (sized from voodoo->h_disp but indexed by 'column'; the visible caller passes h_disp for column)
+        uint8_t fil3[(voodoo->h_disp) * 3];
+
+       /* 16 to 32-bit */
+        for (x=0; x<column;x++)
+        {
+               // Blank scratchpads
+               fil3[x*3+0] = fil[x*3+0] =      ((src[x] & 31) << 3);
+               fil3[x*3+1] = fil[x*3+1] =      (((src[x] >> 5) & 63) << 2);
+               fil3[x*3+2] = fil[x*3+2] =      (((src[x] >> 11) & 31) << 3);
+        }
+
+        /* filtering time */
+
+       /* Each iteration bleeds source pixel x into the pixels at x+3, x+2,
+          x+1 and x-1.  The statements depend on values written by earlier
+          iterations (fil3[x+2] was written as fil3[(x-1)+3]), so their order
+          must not be changed. */
+       for (x=1; x<column-3;x++)
+        {
+               fil3[(x+3)*3]   = voodoo->thefilterb    [((src[x+3] & 31) << 3)]                [((src[x] & 31) << 3)];
+               fil3[(x+3)*3+1] = voodoo->thefilterg    [(((src[x+3] >> 5) & 63) << 2)]         [(((src[x] >> 5) & 63) << 2)];
+               fil3[(x+3)*3+2] = voodoo->thefilter     [(((src[x+3] >> 11) & 31) << 3)]        [(((src[x] >> 11) & 31) << 3)];
+
+               fil[(x+2)*3]   = voodoo->thefilterb     [fil3[(x+2)*3]][((src[x] & 31) << 3)];
+               fil[(x+2)*3+1] = voodoo->thefilterg     [fil3[(x+2)*3+1]][(((src[x] >> 5) & 63) << 2)];
+               fil[(x+2)*3+2] = voodoo->thefilter      [fil3[(x+2)*3+2]][(((src[x] >> 11) & 31) << 3)];
+
+               fil3[(x+1)*3]   = voodoo->thefilterb    [fil[(x+1)*3]][((src[x] & 31) << 3)];
+               fil3[(x+1)*3+1] = voodoo->thefilterg    [fil[(x+1)*3+1]][(((src[x] >> 5) & 63) << 2)];
+               fil3[(x+1)*3+2] = voodoo->thefilter     [fil[(x+1)*3+2]][(((src[x] >> 11) & 31) << 3)];
+
+               fil[(x-1)*3]   = voodoo->thefilterb     [fil3[(x-1)*3]][((src[x] & 31) << 3)];
+               fil[(x-1)*3+1] = voodoo->thefilterg     [fil3[(x-1)*3+1]][(((src[x] >> 5) & 63) << 2)];
+               fil[(x-1)*3+2] = voodoo->thefilter      [fil3[(x-1)*3+2]][(((src[x] >> 11) & 31) << 3)];
+        }
+
+       // unroll for edge cases
+       // NOTE(review): every lookup below uses src[column] as the bleeding pixel, i.e. the
+       // element one past the last pixel converted above (src[column-1]).  Whether that
+       // is intentional, and whether the element is always readable, cannot be told
+       // from this function - confirm against the caller's buffer layout.
+
+       fil3[(column-3)*3]   = voodoo->thefilterb       [((src[column-3] & 31) << 3)]           [((src[column] & 31) << 3)];
+       fil3[(column-3)*3+1] = voodoo->thefilterg       [(((src[column-3] >> 5) & 63) << 2)]    [(((src[column] >> 5) & 63) << 2)];
+       fil3[(column-3)*3+2] = voodoo->thefilter        [(((src[column-3] >> 11) & 31) << 3)]   [(((src[column] >> 11) & 31) << 3)];
+
+       fil3[(column-2)*3]   = voodoo->thefilterb       [((src[column-2] & 31) << 3)]           [((src[column] & 31) << 3)];
+       fil3[(column-2)*3+1] = voodoo->thefilterg       [(((src[column-2] >> 5) & 63) << 2)]    [(((src[column] >> 5) & 63) << 2)];
+       fil3[(column-2)*3+2] = voodoo->thefilter        [(((src[column-2] >> 11) & 31) << 3)]   [(((src[column] >> 11) & 31) << 3)];
+
+       fil3[(column-1)*3]   = voodoo->thefilterb       [((src[column-1] & 31) << 3)]           [((src[column] & 31) << 3)];
+       fil3[(column-1)*3+1] = voodoo->thefilterg       [(((src[column-1] >> 5) & 63) << 2)]    [(((src[column] >> 5) & 63) << 2)];
+       fil3[(column-1)*3+2] = voodoo->thefilter        [(((src[column-1] >> 11) & 31) << 3)]   [(((src[column] >> 11) & 31) << 3)];
+
+       fil[(column-2)*3]   = voodoo->thefilterb        [fil3[(column-2)*3]][((src[column] & 31) << 3)];
+       fil[(column-2)*3+1] = voodoo->thefilterg        [fil3[(column-2)*3+1]][(((src[column] >> 5) & 63) << 2)];
+       fil[(column-2)*3+2] = voodoo->thefilter         [fil3[(column-2)*3+2]][(((src[column] >> 11) & 31) << 3)];
+
+       fil[(column-1)*3]   = voodoo->thefilterb        [fil3[(column-1)*3]][((src[column] & 31) << 3)];
+       fil[(column-1)*3+1] = voodoo->thefilterg        [fil3[(column-1)*3+1]][(((src[column] >> 5) & 63) << 2)];
+       fil[(column-1)*3+2] = voodoo->thefilter         [fil3[(column-1)*3+2]][(((src[column] >> 11) & 31) << 3)];
+
+       /* This last group only updates the scratchpad, which is discarded on return. */
+       fil3[(column-1)*3]   = voodoo->thefilterb       [fil[(column-1)*3]][((src[column] & 31) << 3)];
+       fil3[(column-1)*3+1] = voodoo->thefilterg       [fil[(column-1)*3+1]][(((src[column] >> 5) & 63) << 2)];
+       fil3[(column-1)*3+2] = voodoo->thefilter        [fil[(column-1)*3+2]][(((src[column] >> 11) & 31) << 3)];
+}
+
+/* Per-scanline timer callback.
+
+   p: the voodoo_t instance the timer belongs to.
+
+   For the current scanline it:
+     1. when the FBIINIT0_VGA_PASS bit is set and the line is dirty, converts
+        the line from the front buffer into buffer32 (optionally through the
+        screen filter);
+     2. when line == v_disp, counts the retrace and performs a pending buffer
+        swap if the swap interval has elapsed;
+     3. advances the line counter, blits the finished frame, refreshes the
+        CLUT tables if they were marked dirty, and wraps at v_total;
+     4. re-arms the timer for the next scanline. */
+void voodoo_callback(void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+
+        if (voodoo->fbiInit0 & FBIINIT0_VGA_PASS)
+        {
+                if (voodoo->line < voodoo->v_disp)
+                {
+                        voodoo_t *draw_voodoo;
+                        int draw_line;
+
+                        if (SLI_ENABLED)
+                        {
+                                /* the second card of the set never draws by itself */
+                                if (voodoo == voodoo->set->voodoos[1])
+                                        goto skip_draw;
+
+                                /* line parity selects which card supplies this scanline;
+                                   each card is then indexed with half the line number */
+                                if (((voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) ? 1 : 0) == (voodoo->line & 1))
+                                        draw_voodoo = voodoo;
+                                else
+                                        draw_voodoo = voodoo->set->voodoos[1];
+                                draw_line = voodoo->line >> 1;
+                        }
+                        else
+                        {
+                                if (!(voodoo->fbiInit0 & 1))
+                                        goto skip_draw;
+                                draw_voodoo = voodoo;
+                                draw_line = voodoo->line;
+                        }
+
+                        if (draw_voodoo->dirty_line[draw_line])
+                        {
+                                /* NOTE: this local 'p' shadows the function parameter.
+                                   Output starts 32 pixels into the destination line. */
+                                uint32_t *p = &((uint32_t *)buffer32->line[voodoo->line])[32];
+                                uint16_t *src = (uint16_t *)&draw_voodoo->fb_mem[draw_voodoo->front_offset + draw_line*draw_voodoo->row_width];
+                                int x;
+
+                                draw_voodoo->dirty_line[draw_line] = 0;
+
+                                /* track the range of lines touched this frame; the first
+                                   time the low bound moves we wait for the video buffer */
+                                if (voodoo->line < voodoo->dirty_line_low)
+                                {
+                                        voodoo->dirty_line_low = voodoo->line;
+                                        video_wait_for_buffer();
+                                }
+                                if (voodoo->line > voodoo->dirty_line_high)
+                                        voodoo->dirty_line_high = voodoo->line;
+
+                                if (voodoo->scrfilter && voodoo->scrfilterEnabled)
+                                {
+                                        uint8_t fil[(voodoo->h_disp) * 3];              /* interleaved 24-bit RGB */
+
+                                       if (voodoo->type == VOODOO_2)
+                                               voodoo_filterline_v2(voodoo, fil, voodoo->h_disp, src, voodoo->line);
+                                       else
+                                               voodoo_filterline_v1(voodoo, fil, voodoo->h_disp, src, voodoo->line);
+
+                                        /* fil holds B, G, R bytes; map each through the CLUT */
+                                        for (x = 0; x < voodoo->h_disp; x++)
+                                        {
+                                                p[x] = (voodoo->clutData256[fil[x*3]].b << 0 | voodoo->clutData256[fil[x*3+1]].g << 8 | voodoo->clutData256[fil[x*3+2]].r << 16);
+                                        }
+                                }
+                                else
+                                {
+                                        /* unfiltered: direct 16-bit to 32-bit table lookup */
+                                        for (x = 0; x < voodoo->h_disp; x++)
+                                        {
+                                                p[x] = draw_voodoo->video_16to32[src[x]];
+                                        }
+                                }
+                        }
+                }
+        }
+skip_draw:
+        if (voodoo->line == voodoo->v_disp)
+        {
+//                pclog("retrace %i %i %08x %i\n", voodoo->retrace_count, voodoo->swap_interval, voodoo->swap_offset, voodoo->swap_pending);
+                voodoo->retrace_count++;
+                if (SLI_ENABLED && (voodoo->fbiInit2 & FBIINIT2_SWAP_ALGORITHM_MASK) == FBIINIT2_SWAP_ALGORITHM_SLI_SYNC)
+                {
+                        /* synchronised SLI swap is driven by the first card only */
+                        if (voodoo == voodoo->set->voodoos[0])
+                        {
+                                voodoo_t *voodoo_1 = voodoo->set->voodoos[1];
+
+                                /* NOTE(review): only the first card's swap_mutex is taken
+                                   while the second card's swap state is modified - confirm
+                                   that is the intended locking scheme. */
+                                thread_lock_mutex(voodoo->swap_mutex);
+                                /*Only swap if both Voodoos are waiting for buffer swap*/
+                                if (voodoo->swap_pending && (voodoo->retrace_count > voodoo->swap_interval) &&
+                                    voodoo_1->swap_pending && (voodoo_1->retrace_count > voodoo_1->swap_interval))
+                                {
+                                        /* NOTE(review): marks 1024 lines dirty although
+                                           dirty_line[] is declared with 2048 entries */
+                                        memset(voodoo->dirty_line, 1, 1024);
+                                        voodoo->retrace_count = 0;
+                                        voodoo->front_offset = voodoo->swap_offset;
+                                        if (voodoo->swap_count > 0)
+                                                voodoo->swap_count--;
+                                        voodoo->swap_pending = 0;
+
+                                        memset(voodoo_1->dirty_line, 1, 1024);
+                                        voodoo_1->retrace_count = 0;
+                                        voodoo_1->front_offset = voodoo_1->swap_offset;
+                                        if (voodoo_1->swap_count > 0)
+                                                voodoo_1->swap_count--;
+                                        voodoo_1->swap_pending = 0;
+                                        thread_unlock_mutex(voodoo->swap_mutex);
+
+                                        thread_set_event(voodoo->wake_fifo_thread);
+                                        thread_set_event(voodoo_1->wake_fifo_thread);
+
+                                        voodoo->frame_count++;
+                                        voodoo_1->frame_count++;
+                                }
+                                else
+                                        thread_unlock_mutex(voodoo->swap_mutex);
+                        }
+                }
+                else
+                {
+                        thread_lock_mutex(voodoo->swap_mutex);
+                        if (voodoo->swap_pending && (voodoo->retrace_count > voodoo->swap_interval))
+                        {
+                                voodoo->front_offset = voodoo->swap_offset;
+                                if (voodoo->swap_count > 0)
+                                        voodoo->swap_count--;
+                                voodoo->swap_pending = 0;
+                                thread_unlock_mutex(voodoo->swap_mutex);
+
+                                /* NOTE(review): 1024 vs. the 2048-entry dirty_line[] - see above */
+                                memset(voodoo->dirty_line, 1, 1024);
+                                voodoo->retrace_count = 0;
+                                thread_set_event(voodoo->wake_fifo_thread);
+                                voodoo->frame_count++;
+                        }
+                        else
+                                thread_unlock_mutex(voodoo->swap_mutex);
+                }
+                voodoo->v_retrace = 1;
+        }
+        voodoo->line++;
+
+        if (voodoo->fbiInit0 & FBIINIT0_VGA_PASS)
+        {
+                /* 'line' was just incremented, so this fires once the last
+                   displayed line has been processed */
+                if (voodoo->line == voodoo->v_disp)
+                {
+                        /* NOTE(review): strict '>' means a frame in which exactly one
+                           line was dirty (high == low) is not blitted - confirm intent */
+                        if (voodoo->dirty_line_high > voodoo->dirty_line_low)
+                                svga_doblit(0, voodoo->v_disp, voodoo->h_disp, voodoo->v_disp-1, voodoo->svga);
+                        if (voodoo->clutData_dirty)
+                        {
+                                voodoo->clutData_dirty = 0;
+                                voodoo_calc_clutData(voodoo);
+                        }
+                        /* reset the dirty range to "empty" sentinels */
+                        voodoo->dirty_line_high = -1;
+                        voodoo->dirty_line_low = 2000;
+                }
+        }
+
+        if (voodoo->line >= voodoo->v_total)
+        {
+                voodoo->line = 0;
+                voodoo->v_retrace = 0;
+        }
+        /* re-arm for the next scanline; fall back to a fixed 32 * TIMER_USEC
+           period while no line time has been computed (line_time == 0) */
+        if (voodoo->line_time)
+               timer_advance_u64(&voodoo->timer, voodoo->line_time);
+        else
+               timer_advance_u64(&voodoo->timer, TIMER_USEC * 32);
+}
diff --git a/pcem/vid_voodoo_display.h b/pcem/vid_voodoo_display.h
new file mode 100644 (file)
index 0000000..46cbc00
--- /dev/null
@@ -0,0 +1,6 @@
+/* Rebuild the NCC colour lookup tables of one TMU from its nccTable registers. */
+void voodoo_update_ncc(voodoo_t *voodoo, int tmu);
+/* Recompute pixel_clock and line_time from the DAC PLL, DAC mode and hSync registers. */
+void voodoo_pixelclock_update(voodoo_t *voodoo);
+/* Build the screen filter tables (and purpleline) using the v1 algorithm. */
+void voodoo_generate_filter_v1(voodoo_t *voodoo);
+/* Build the screen filter tables using the v2 algorithm (used for VOODOO_2). */
+void voodoo_generate_filter_v2(voodoo_t *voodoo);
+/* Regenerate the filter tables if the filter is enabled and its threshold changed. */
+void voodoo_threshold_check(voodoo_t *voodoo);
+/* Per-scanline timer callback; p is the voodoo_t instance. */
+void voodoo_callback(void *p);
diff --git a/pcem/vid_voodoo_dither.h b/pcem/vid_voodoo_dither.h
new file mode 100644 (file)
index 0000000..67d4838
--- /dev/null
@@ -0,0 +1,5136 @@
+static const uint8_t dither_rb[256][4][4] =
+{
+       {
+               {0, 0, 0, 0},
+               {0, 0, 0, 0},
+               {0, 0, 0, 0},
+               {0, 0, 0, 0},
+       },
+       {
+               {0, 0, 0, 0},
+               {0, 0, 1, 0},
+               {0, 0, 0, 0},
+               {1, 0, 0, 0},
+       },
+       {
+               {0, 0, 0, 0},
+               {1, 0, 1, 0},
+               {0, 0, 0, 0},
+               {1, 0, 1, 0},
+       },
+       {
+               {0, 0, 0, 1},
+               {1, 0, 1, 0},
+               {0, 1, 0, 0},
+               {1, 0, 1, 0},
+       },
+       {
+               {0, 1, 0, 1},
+               {1, 0, 1, 0},
+               {0, 1, 0, 1},
+               {1, 0, 1, 0},
+       },
+       {
+               {0, 1, 0, 1},
+               {1, 0, 1, 1},
+               {0, 1, 0, 1},
+               {1, 1, 1, 0},
+       },
+       {
+               {0, 1, 0, 1},
+               {1, 1, 1, 1},
+               {0, 1, 0, 1},
+               {1, 1, 1, 1},
+       },
+       {
+               {0, 1, 1, 1},
+               {1, 1, 1, 1},
+               {1, 1, 0, 1},
+               {1, 1, 1, 1},
+       },
+       {
+               {1, 1, 1, 1},
+               {1, 1, 1, 1},
+               {1, 1, 1, 1},
+               {1, 1, 1, 1},
+       },
+       {
+               {1, 1, 1, 1},
+               {1, 1, 2, 1},
+               {1, 1, 1, 1},
+               {2, 1, 1, 1},
+       },
+       {
+               {1, 1, 1, 1},
+               {2, 1, 2, 1},
+               {1, 1, 1, 1},
+               {2, 1, 2, 1},
+       },
+       {
+               {1, 1, 1, 2},
+               {2, 1, 2, 1},
+               {1, 2, 1, 1},
+               {2, 1, 2, 1},
+       },
+       {
+               {1, 2, 1, 2},
+               {2, 1, 2, 1},
+               {1, 2, 1, 2},
+               {2, 1, 2, 1},
+       },
+       {
+               {1, 2, 1, 2},
+               {2, 1, 2, 2},
+               {1, 2, 1, 2},
+               {2, 2, 2, 1},
+       },
+       {
+               {1, 2, 1, 2},
+               {2, 2, 2, 2},
+               {1, 2, 1, 2},
+               {2, 2, 2, 2},
+       },
+       {
+               {1, 2, 2, 2},
+               {2, 2, 2, 2},
+               {2, 2, 1, 2},
+               {2, 2, 2, 2},
+       },
+       {
+               {1, 2, 2, 2},
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+       },
+       {
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+               {3, 2, 2, 2},
+       },
+       {
+               {2, 2, 2, 2},
+               {2, 2, 3, 2},
+               {2, 2, 2, 2},
+               {3, 2, 3, 2},
+       },
+       {
+               {2, 2, 2, 2},
+               {3, 2, 3, 2},
+               {2, 3, 2, 2},
+               {3, 2, 3, 2},
+       },
+       {
+               {2, 2, 2, 3},
+               {3, 2, 3, 2},
+               {2, 3, 2, 3},
+               {3, 2, 3, 2},
+       },
+       {
+               {2, 3, 2, 3},
+               {3, 2, 3, 2},
+               {2, 3, 2, 3},
+               {3, 3, 3, 2},
+       },
+       {
+               {2, 3, 2, 3},
+               {3, 2, 3, 3},
+               {2, 3, 2, 3},
+               {3, 3, 3, 3},
+       },
+       {
+               {2, 3, 2, 3},
+               {3, 3, 3, 3},
+               {3, 3, 2, 3},
+               {3, 3, 3, 3},
+       },
+       {
+               {2, 3, 3, 3},
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+       },
+       {
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+               {4, 3, 3, 3},
+       },
+       {
+               {3, 3, 3, 3},
+               {3, 3, 4, 3},
+               {3, 3, 3, 3},
+               {4, 3, 4, 3},
+       },
+       {
+               {3, 3, 3, 3},
+               {4, 3, 4, 3},
+               {3, 4, 3, 3},
+               {4, 3, 4, 3},
+       },
+       {
+               {3, 3, 3, 4},
+               {4, 3, 4, 3},
+               {3, 4, 3, 4},
+               {4, 3, 4, 3},
+       },
+       {
+               {3, 4, 3, 4},
+               {4, 3, 4, 3},
+               {3, 4, 3, 4},
+               {4, 4, 4, 3},
+       },
+       {
+               {3, 4, 3, 4},
+               {4, 3, 4, 4},
+               {3, 4, 3, 4},
+               {4, 4, 4, 4},
+       },
+       {
+               {3, 4, 3, 4},
+               {4, 4, 4, 4},
+               {4, 4, 3, 4},
+               {4, 4, 4, 4},
+       },
+       {
+               {3, 4, 4, 4},
+               {4, 4, 4, 4},
+               {4, 4, 3, 4},
+               {4, 4, 4, 4},
+       },
+       {
+               {4, 4, 4, 4},
+               {4, 4, 4, 4},
+               {4, 4, 4, 4},
+               {4, 4, 4, 4},
+       },
+       {
+               {4, 4, 4, 4},
+               {4, 4, 5, 4},
+               {4, 4, 4, 4},
+               {5, 4, 4, 4},
+       },
+       {
+               {4, 4, 4, 4},
+               {5, 4, 5, 4},
+               {4, 4, 4, 4},
+               {5, 4, 5, 4},
+       },
+       {
+               {4, 4, 4, 5},
+               {5, 4, 5, 4},
+               {4, 5, 4, 4},
+               {5, 4, 5, 4},
+       },
+       {
+               {4, 5, 4, 5},
+               {5, 4, 5, 4},
+               {4, 5, 4, 5},
+               {5, 4, 5, 4},
+       },
+       {
+               {4, 5, 4, 5},
+               {5, 4, 5, 5},
+               {4, 5, 4, 5},
+               {5, 5, 5, 4},
+       },
+       {
+               {4, 5, 4, 5},
+               {5, 5, 5, 5},
+               {4, 5, 4, 5},
+               {5, 5, 5, 5},
+       },
+       {
+               {4, 5, 5, 5},
+               {5, 5, 5, 5},
+               {5, 5, 4, 5},
+               {5, 5, 5, 5},
+       },
+       {
+               {5, 5, 5, 5},
+               {5, 5, 5, 5},
+               {5, 5, 5, 5},
+               {5, 5, 5, 5},
+       },
+       {
+               {5, 5, 5, 5},
+               {5, 5, 6, 5},
+               {5, 5, 5, 5},
+               {6, 5, 5, 5},
+       },
+       {
+               {5, 5, 5, 5},
+               {6, 5, 6, 5},
+               {5, 5, 5, 5},
+               {6, 5, 6, 5},
+       },
+       {
+               {5, 5, 5, 6},
+               {6, 5, 6, 5},
+               {5, 6, 5, 5},
+               {6, 5, 6, 5},
+       },
+       {
+               {5, 6, 5, 6},
+               {6, 5, 6, 5},
+               {5, 6, 5, 6},
+               {6, 5, 6, 5},
+       },
+       {
+               {5, 6, 5, 6},
+               {6, 5, 6, 6},
+               {5, 6, 5, 6},
+               {6, 6, 6, 5},
+       },
+       {
+               {5, 6, 5, 6},
+               {6, 6, 6, 6},
+               {5, 6, 5, 6},
+               {6, 6, 6, 6},
+       },
+       {
+               {5, 6, 5, 6},
+               {6, 6, 6, 6},
+               {6, 6, 5, 6},
+               {6, 6, 6, 6},
+       },
+       {
+               {5, 6, 6, 6},
+               {6, 6, 6, 6},
+               {6, 6, 6, 6},
+               {6, 6, 6, 6},
+       },
+       {
+               {6, 6, 6, 6},
+               {6, 6, 6, 6},
+               {6, 6, 6, 6},
+               {7, 6, 6, 6},
+       },
+       {
+               {6, 6, 6, 6},
+               {6, 6, 7, 6},
+               {6, 6, 6, 6},
+               {7, 6, 7, 6},
+       },
+       {
+               {6, 6, 6, 6},
+               {7, 6, 7, 6},
+               {6, 7, 6, 6},
+               {7, 6, 7, 6},
+       },
+       {
+               {6, 6, 6, 7},
+               {7, 6, 7, 6},
+               {6, 7, 6, 7},
+               {7, 6, 7, 6},
+       },
+       {
+               {6, 7, 6, 7},
+               {7, 6, 7, 6},
+               {6, 7, 6, 7},
+               {7, 7, 7, 6},
+       },
+       {
+               {6, 7, 6, 7},
+               {7, 6, 7, 7},
+               {6, 7, 6, 7},
+               {7, 7, 7, 7},
+       },
+       {
+               {6, 7, 6, 7},
+               {7, 7, 7, 7},
+               {7, 7, 6, 7},
+               {7, 7, 7, 7},
+       },
+       {
+               {6, 7, 7, 7},
+               {7, 7, 7, 7},
+               {7, 7, 7, 7},
+               {7, 7, 7, 7},
+       },
+       {
+               {7, 7, 7, 7},
+               {7, 7, 7, 7},
+               {7, 7, 7, 7},
+               {8, 7, 7, 7},
+       },
+       {
+               {7, 7, 7, 7},
+               {7, 7, 8, 7},
+               {7, 7, 7, 7},
+               {8, 7, 8, 7},
+       },
+       {
+               {7, 7, 7, 7},
+               {8, 7, 8, 7},
+               {7, 8, 7, 7},
+               {8, 7, 8, 7},
+       },
+       {
+               {7, 7, 7, 8},
+               {8, 7, 8, 7},
+               {7, 8, 7, 8},
+               {8, 7, 8, 7},
+       },
+       {
+               {7, 8, 7, 8},
+               {8, 7, 8, 7},
+               {7, 8, 7, 8},
+               {8, 8, 8, 7},
+       },
+       {
+               {7, 8, 7, 8},
+               {8, 7, 8, 8},
+               {7, 8, 7, 8},
+               {8, 8, 8, 8},
+       },
+       {
+               {7, 8, 7, 8},
+               {8, 8, 8, 8},
+               {7, 8, 7, 8},
+               {8, 8, 8, 8},
+       },
+       {
+               {7, 8, 8, 8},
+               {8, 8, 8, 8},
+               {8, 8, 7, 8},
+               {8, 8, 8, 8},
+       },
+       {
+               {8, 8, 8, 8},
+               {8, 8, 8, 8},
+               {8, 8, 8, 8},
+               {8, 8, 8, 8},
+       },
+       {
+               {8, 8, 8, 8},
+               {8, 8, 9, 8},
+               {8, 8, 8, 8},
+               {9, 8, 8, 8},
+       },
+       {
+               {8, 8, 8, 8},
+               {9, 8, 9, 8},
+               {8, 8, 8, 8},
+               {9, 8, 9, 8},
+       },
+       {
+               {8, 8, 8, 9},
+               {9, 8, 9, 8},
+               {8, 9, 8, 8},
+               {9, 8, 9, 8},
+       },
+       {
+               {8, 9, 8, 9},
+               {9, 8, 9, 8},
+               {8, 9, 8, 9},
+               {9, 8, 9, 8},
+       },
+       {
+               {8, 9, 8, 9},
+               {9, 8, 9, 9},
+               {8, 9, 8, 9},
+               {9, 9, 9, 8},
+       },
+       {
+               {8, 9, 8, 9},
+               {9, 9, 9, 9},
+               {8, 9, 8, 9},
+               {9, 9, 9, 9},
+       },
+       {
+               {8, 9, 9, 9},
+               {9, 9, 9, 9},
+               {9, 9, 8, 9},
+               {9, 9, 9, 9},
+       },
+       {
+               {9, 9, 9, 9},
+               {9, 9, 9, 9},
+               {9, 9, 9, 9},
+               {9, 9, 9, 9},
+       },
+       {
+               {9, 9, 9, 9},
+               {9, 9, 10, 9},
+               {9, 9, 9, 9},
+               {10, 9, 9, 9},
+       },
+       {
+               {9, 9, 9, 9},
+               {10, 9, 10, 9},
+               {9, 9, 9, 9},
+               {10, 9, 10, 9},
+       },
+       {
+               {9, 9, 9, 10},
+               {10, 9, 10, 9},
+               {9, 10, 9, 9},
+               {10, 9, 10, 9},
+       },
+       {
+               {9, 10, 9, 10},
+               {10, 9, 10, 9},
+               {9, 10, 9, 10},
+               {10, 9, 10, 9},
+       },
+       {
+               {9, 10, 9, 10},
+               {10, 9, 10, 10},
+               {9, 10, 9, 10},
+               {10, 10, 10, 9},
+       },
+       {
+               {9, 10, 9, 10},
+               {10, 9, 10, 10},
+               {9, 10, 9, 10},
+               {10, 10, 10, 10},
+       },
+       {
+               {9, 10, 9, 10},
+               {10, 10, 10, 10},
+               {10, 10, 9, 10},
+               {10, 10, 10, 10},
+       },
+       {
+               {9, 10, 10, 10},
+               {10, 10, 10, 10},
+               {10, 10, 10, 10},
+               {10, 10, 10, 10},
+       },
+       {
+               {10, 10, 10, 10},
+               {10, 10, 10, 10},
+               {10, 10, 10, 10},
+               {11, 10, 10, 10},
+       },
+       {
+               {10, 10, 10, 10},
+               {10, 10, 11, 10},
+               {10, 10, 10, 10},
+               {11, 10, 11, 10},
+       },
+       {
+               {10, 10, 10, 10},
+               {11, 10, 11, 10},
+               {10, 11, 10, 10},
+               {11, 10, 11, 10},
+       },
+       {
+               {10, 10, 10, 11},
+               {11, 10, 11, 10},
+               {10, 11, 10, 11},
+               {11, 10, 11, 10},
+       },
+       {
+               {10, 11, 10, 11},
+               {11, 10, 11, 10},
+               {10, 11, 10, 11},
+               {11, 11, 11, 10},
+       },
+       {
+               {10, 11, 10, 11},
+               {11, 10, 11, 11},
+               {10, 11, 10, 11},
+               {11, 11, 11, 11},
+       },
+       {
+               {10, 11, 10, 11},
+               {11, 11, 11, 11},
+               {11, 11, 10, 11},
+               {11, 11, 11, 11},
+       },
+       {
+               {10, 11, 11, 11},
+               {11, 11, 11, 11},
+               {11, 11, 11, 11},
+               {11, 11, 11, 11},
+       },
+       {
+               {11, 11, 11, 11},
+               {11, 11, 11, 11},
+               {11, 11, 11, 11},
+               {12, 11, 11, 11},
+       },
+       {
+               {11, 11, 11, 11},
+               {11, 11, 12, 11},
+               {11, 11, 11, 11},
+               {12, 11, 12, 11},
+       },
+       {
+               {11, 11, 11, 11},
+               {12, 11, 12, 11},
+               {11, 12, 11, 11},
+               {12, 11, 12, 11},
+       },
+       {
+               {11, 11, 11, 12},
+               {12, 11, 12, 11},
+               {11, 12, 11, 12},
+               {12, 11, 12, 11},
+       },
+       {
+               {11, 12, 11, 12},
+               {12, 11, 12, 11},
+               {11, 12, 11, 12},
+               {12, 12, 12, 11},
+       },
+       {
+               {11, 12, 11, 12},
+               {12, 11, 12, 12},
+               {11, 12, 11, 12},
+               {12, 12, 12, 11},
+       },
+       {
+               {11, 12, 11, 12},
+               {12, 12, 12, 12},
+               {11, 12, 11, 12},
+               {12, 12, 12, 12},
+       },
+       {
+               {11, 12, 12, 12},
+               {12, 12, 12, 12},
+               {12, 12, 11, 12},
+               {12, 12, 12, 12},
+       },
+       {
+               {12, 12, 12, 12},
+               {12, 12, 12, 12},
+               {12, 12, 12, 12},
+               {12, 12, 12, 12},
+       },
+       {
+               {12, 12, 12, 12},
+               {12, 12, 13, 12},
+               {12, 12, 12, 12},
+               {13, 12, 12, 12},
+       },
+       {
+               {12, 12, 12, 12},
+               {13, 12, 13, 12},
+               {12, 12, 12, 12},
+               {13, 12, 13, 12},
+       },
+       {
+               {12, 12, 12, 13},
+               {13, 12, 13, 12},
+               {12, 13, 12, 12},
+               {13, 12, 13, 12},
+       },
+       {
+               {12, 13, 12, 13},
+               {13, 12, 13, 12},
+               {12, 13, 12, 13},
+               {13, 12, 13, 12},
+       },
+       {
+               {12, 13, 12, 13},
+               {13, 12, 13, 13},
+               {12, 13, 12, 13},
+               {13, 13, 13, 12},
+       },
+       {
+               {12, 13, 12, 13},
+               {13, 13, 13, 13},
+               {12, 13, 12, 13},
+               {13, 13, 13, 13},
+       },
+       {
+               {12, 13, 13, 13},
+               {13, 13, 13, 13},
+               {13, 13, 12, 13},
+               {13, 13, 13, 13},
+       },
+       {
+               {13, 13, 13, 13},
+               {13, 13, 13, 13},
+               {13, 13, 13, 13},
+               {13, 13, 13, 13},
+       },
+       {
+               {13, 13, 13, 13},
+               {13, 13, 14, 13},
+               {13, 13, 13, 13},
+               {14, 13, 13, 13},
+       },
+       {
+               {13, 13, 13, 13},
+               {14, 13, 14, 13},
+               {13, 13, 13, 13},
+               {14, 13, 14, 13},
+       },
+       {
+               {13, 13, 13, 14},
+               {14, 13, 14, 13},
+               {13, 14, 13, 13},
+               {14, 13, 14, 13},
+       },
+       {
+               {13, 14, 13, 14},
+               {14, 13, 14, 13},
+               {13, 14, 13, 14},
+               {14, 13, 14, 13},
+       },
+       {
+               {13, 14, 13, 14},
+               {14, 13, 14, 13},
+               {13, 14, 13, 14},
+               {14, 14, 14, 13},
+       },
+       {
+               {13, 14, 13, 14},
+               {14, 13, 14, 14},
+               {13, 14, 13, 14},
+               {14, 14, 14, 14},
+       },
+       {
+               {13, 14, 13, 14},
+               {14, 14, 14, 14},
+               {14, 14, 13, 14},
+               {14, 14, 14, 14},
+       },
+       {
+               {13, 14, 14, 14},
+               {14, 14, 14, 14},
+               {14, 14, 14, 14},
+               {14, 14, 14, 14},
+       },
+       {
+               {14, 14, 14, 14},
+               {14, 14, 14, 14},
+               {14, 14, 14, 14},
+               {15, 14, 14, 14},
+       },
+       {
+               {14, 14, 14, 14},
+               {14, 14, 15, 14},
+               {14, 14, 14, 14},
+               {15, 14, 15, 14},
+       },
+       {
+               {14, 14, 14, 14},
+               {15, 14, 15, 14},
+               {14, 15, 14, 14},
+               {15, 14, 15, 14},
+       },
+       {
+               {14, 14, 14, 15},
+               {15, 14, 15, 14},
+               {14, 15, 14, 15},
+               {15, 14, 15, 14},
+       },
+       {
+               {14, 15, 14, 15},
+               {15, 14, 15, 14},
+               {14, 15, 14, 15},
+               {15, 15, 15, 14},
+       },
+       {
+               {14, 15, 14, 15},
+               {15, 14, 15, 15},
+               {14, 15, 14, 15},
+               {15, 15, 15, 15},
+       },
+       {
+               {14, 15, 14, 15},
+               {15, 15, 15, 15},
+               {15, 15, 14, 15},
+               {15, 15, 15, 15},
+       },
+       {
+               {14, 15, 15, 15},
+               {15, 15, 15, 15},
+               {15, 15, 15, 15},
+               {15, 15, 15, 15},
+       },
+       {
+               {15, 15, 15, 15},
+               {15, 15, 15, 15},
+               {15, 15, 15, 15},
+               {16, 15, 15, 15},
+       },
+       {
+               {15, 15, 15, 15},
+               {15, 15, 16, 15},
+               {15, 15, 15, 15},
+               {16, 15, 16, 15},
+       },
+       {
+               {15, 15, 15, 15},
+               {16, 15, 16, 15},
+               {15, 16, 15, 15},
+               {16, 15, 16, 15},
+       },
+       {
+               {15, 15, 15, 16},
+               {16, 15, 16, 15},
+               {15, 16, 15, 16},
+               {16, 15, 16, 15},
+       },
+       {
+               {15, 16, 15, 16},
+               {16, 15, 16, 15},
+               {15, 16, 15, 16},
+               {16, 16, 16, 15},
+       },
+       {
+               {15, 16, 15, 16},
+               {16, 15, 16, 16},
+               {15, 16, 15, 16},
+               {16, 16, 16, 16},
+       },
+       {
+               {15, 16, 15, 16},
+               {16, 16, 16, 16},
+               {16, 16, 15, 16},
+               {16, 16, 16, 16},
+       },
+       {
+               {15, 16, 16, 16},
+               {16, 16, 16, 16},
+               {16, 16, 16, 16},
+               {16, 16, 16, 16},
+       },
+       {
+               {16, 16, 16, 16},
+               {16, 16, 16, 16},
+               {16, 16, 16, 16},
+               {17, 16, 16, 16},
+       },
+       {
+               {16, 16, 16, 16},
+               {16, 16, 17, 16},
+               {16, 16, 16, 16},
+               {17, 16, 17, 16},
+       },
+       {
+               {16, 16, 16, 16},
+               {17, 16, 17, 16},
+               {16, 17, 16, 16},
+               {17, 16, 17, 16},
+       },
+       {
+               {16, 16, 16, 17},
+               {17, 16, 17, 16},
+               {16, 17, 16, 17},
+               {17, 16, 17, 16},
+       },
+       {
+               {16, 17, 16, 17},
+               {17, 16, 17, 16},
+               {16, 17, 16, 17},
+               {17, 17, 17, 16},
+       },
+       {
+               {16, 17, 16, 17},
+               {17, 16, 17, 17},
+               {16, 17, 16, 17},
+               {17, 17, 17, 17},
+       },
+       {
+               {16, 17, 16, 17},
+               {17, 17, 17, 17},
+               {17, 17, 16, 17},
+               {17, 17, 17, 17},
+       },
+       {
+               {16, 17, 17, 17},
+               {17, 17, 17, 17},
+               {17, 17, 17, 17},
+               {17, 17, 17, 17},
+       },
+       {
+               {17, 17, 17, 17},
+               {17, 17, 17, 17},
+               {17, 17, 17, 17},
+               {18, 17, 17, 17},
+       },
+       {
+               {17, 17, 17, 17},
+               {17, 17, 18, 17},
+               {17, 17, 17, 17},
+               {18, 17, 18, 17},
+       },
+       {
+               {17, 17, 17, 17},
+               {18, 17, 18, 17},
+               {17, 18, 17, 17},
+               {18, 17, 18, 17},
+       },
+       {
+               {17, 17, 17, 18},
+               {18, 17, 18, 17},
+               {17, 18, 17, 18},
+               {18, 17, 18, 17},
+       },
+       {
+               {17, 18, 17, 18},
+               {18, 17, 18, 17},
+               {17, 18, 17, 18},
+               {18, 17, 18, 17},
+       },
+       {
+               {17, 18, 17, 18},
+               {18, 17, 18, 18},
+               {17, 18, 17, 18},
+               {18, 18, 18, 17},
+       },
+       {
+               {17, 18, 17, 18},
+               {18, 18, 18, 18},
+               {17, 18, 17, 18},
+               {18, 18, 18, 18},
+       },
+       {
+               {17, 18, 18, 18},
+               {18, 18, 18, 18},
+               {18, 18, 17, 18},
+               {18, 18, 18, 18},
+       },
+       {
+               {18, 18, 18, 18},
+               {18, 18, 18, 18},
+               {18, 18, 18, 18},
+               {18, 18, 18, 18},
+       },
+       {
+               {18, 18, 18, 18},
+               {18, 18, 19, 18},
+               {18, 18, 18, 18},
+               {19, 18, 18, 18},
+       },
+       {
+               {18, 18, 18, 18},
+               {19, 18, 19, 18},
+               {18, 18, 18, 18},
+               {19, 18, 19, 18},
+       },
+       {
+               {18, 18, 18, 19},
+               {19, 18, 19, 18},
+               {18, 19, 18, 18},
+               {19, 18, 19, 18},
+       },
+       {
+               {18, 19, 18, 19},
+               {19, 18, 19, 18},
+               {18, 19, 18, 19},
+               {19, 18, 19, 18},
+       },
+       {
+               {18, 19, 18, 19},
+               {19, 18, 19, 19},
+               {18, 19, 18, 19},
+               {19, 19, 19, 18},
+       },
+       {
+               {18, 19, 18, 19},
+               {19, 19, 19, 19},
+               {18, 19, 18, 19},
+               {19, 19, 19, 19},
+       },
+       {
+               {18, 19, 19, 19},
+               {19, 19, 19, 19},
+               {19, 19, 18, 19},
+               {19, 19, 19, 19},
+       },
+       {
+               {19, 19, 19, 19},
+               {19, 19, 19, 19},
+               {19, 19, 19, 19},
+               {19, 19, 19, 19},
+       },
+       {
+               {19, 19, 19, 19},
+               {19, 19, 20, 19},
+               {19, 19, 19, 19},
+               {20, 19, 19, 19},
+       },
+       {
+               {19, 19, 19, 19},
+               {20, 19, 20, 19},
+               {19, 19, 19, 19},
+               {20, 19, 20, 19},
+       },
+       {
+               {19, 19, 19, 20},
+               {20, 19, 20, 19},
+               {19, 20, 19, 19},
+               {20, 19, 20, 19},
+       },
+       {
+               {19, 19, 19, 20},
+               {20, 19, 20, 19},
+               {19, 20, 19, 20},
+               {20, 19, 20, 19},
+       },
+       {
+               {19, 20, 19, 20},
+               {20, 19, 20, 19},
+               {19, 20, 19, 20},
+               {20, 20, 20, 19},
+       },
+       {
+               {19, 20, 19, 20},
+               {20, 19, 20, 20},
+               {19, 20, 19, 20},
+               {20, 20, 20, 20},
+       },
+       {
+               {19, 20, 19, 20},
+               {20, 20, 20, 20},
+               {20, 20, 19, 20},
+               {20, 20, 20, 20},
+       },
+       {
+               {19, 20, 20, 20},
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+       },
+       {
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+               {21, 20, 20, 20},
+       },
+       {
+               {20, 20, 20, 20},
+               {20, 20, 21, 20},
+               {20, 20, 20, 20},
+               {21, 20, 21, 20},
+       },
+       {
+               {20, 20, 20, 20},
+               {21, 20, 21, 20},
+               {20, 21, 20, 20},
+               {21, 20, 21, 20},
+       },
+       {
+               {20, 20, 20, 21},
+               {21, 20, 21, 20},
+               {20, 21, 20, 21},
+               {21, 20, 21, 20},
+       },
+       {
+               {20, 21, 20, 21},
+               {21, 20, 21, 20},
+               {20, 21, 20, 21},
+               {21, 21, 21, 20},
+       },
+       {
+               {20, 21, 20, 21},
+               {21, 20, 21, 21},
+               {20, 21, 20, 21},
+               {21, 21, 21, 21},
+       },
+       {
+               {20, 21, 20, 21},
+               {21, 21, 21, 21},
+               {21, 21, 20, 21},
+               {21, 21, 21, 21},
+       },
+       {
+               {20, 21, 21, 21},
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+       },
+       {
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+               {22, 21, 21, 21},
+       },
+       {
+               {21, 21, 21, 21},
+               {21, 21, 22, 21},
+               {21, 21, 21, 21},
+               {22, 21, 22, 21},
+       },
+       {
+               {21, 21, 21, 21},
+               {22, 21, 22, 21},
+               {21, 22, 21, 21},
+               {22, 21, 22, 21},
+       },
+       {
+               {21, 21, 21, 22},
+               {22, 21, 22, 21},
+               {21, 22, 21, 21},
+               {22, 21, 22, 21},
+       },
+       {
+               {21, 22, 21, 22},
+               {22, 21, 22, 21},
+               {21, 22, 21, 22},
+               {22, 21, 22, 21},
+       },
+       {
+               {21, 22, 21, 22},
+               {22, 21, 22, 22},
+               {21, 22, 21, 22},
+               {22, 22, 22, 21},
+       },
+       {
+               {21, 22, 21, 22},
+               {22, 22, 22, 22},
+               {21, 22, 21, 22},
+               {22, 22, 22, 22},
+       },
+       {
+               {21, 22, 22, 22},
+               {22, 22, 22, 22},
+               {22, 22, 21, 22},
+               {22, 22, 22, 22},
+       },
+       {
+               {22, 22, 22, 22},
+               {22, 22, 22, 22},
+               {22, 22, 22, 22},
+               {22, 22, 22, 22},
+       },
+       {
+               {22, 22, 22, 22},
+               {22, 22, 23, 22},
+               {22, 22, 22, 22},
+               {23, 22, 22, 22},
+       },
+       {
+               {22, 22, 22, 22},
+               {23, 22, 23, 22},
+               {22, 22, 22, 22},
+               {23, 22, 23, 22},
+       },
+       {
+               {22, 22, 22, 23},
+               {23, 22, 23, 22},
+               {22, 23, 22, 22},
+               {23, 22, 23, 22},
+       },
+       {
+               {22, 23, 22, 23},
+               {23, 22, 23, 22},
+               {22, 23, 22, 23},
+               {23, 22, 23, 22},
+       },
+       {
+               {22, 23, 22, 23},
+               {23, 22, 23, 23},
+               {22, 23, 22, 23},
+               {23, 23, 23, 22},
+       },
+       {
+               {22, 23, 22, 23},
+               {23, 23, 23, 23},
+               {22, 23, 22, 23},
+               {23, 23, 23, 23},
+       },
+       {
+               {22, 23, 23, 23},
+               {23, 23, 23, 23},
+               {23, 23, 22, 23},
+               {23, 23, 23, 23},
+       },
+       {
+               {23, 23, 23, 23},
+               {23, 23, 23, 23},
+               {23, 23, 23, 23},
+               {23, 23, 23, 23},
+       },
+       {
+               {23, 23, 23, 23},
+               {23, 23, 24, 23},
+               {23, 23, 23, 23},
+               {24, 23, 23, 23},
+       },
+       {
+               {23, 23, 23, 23},
+               {24, 23, 24, 23},
+               {23, 23, 23, 23},
+               {24, 23, 24, 23},
+       },
+       {
+               {23, 23, 23, 23},
+               {24, 23, 24, 23},
+               {23, 24, 23, 23},
+               {24, 23, 24, 23},
+       },
+       {
+               {23, 23, 23, 24},
+               {24, 23, 24, 23},
+               {23, 24, 23, 24},
+               {24, 23, 24, 23},
+       },
+       {
+               {23, 24, 23, 24},
+               {24, 23, 24, 23},
+               {23, 24, 23, 24},
+               {24, 24, 24, 23},
+       },
+       {
+               {23, 24, 23, 24},
+               {24, 23, 24, 24},
+               {23, 24, 23, 24},
+               {24, 24, 24, 24},
+       },
+       {
+               {23, 24, 23, 24},
+               {24, 24, 24, 24},
+               {24, 24, 23, 24},
+               {24, 24, 24, 24},
+       },
+       {
+               {23, 24, 24, 24},
+               {24, 24, 24, 24},
+               {24, 24, 24, 24},
+               {24, 24, 24, 24},
+       },
+       {
+               {24, 24, 24, 24},
+               {24, 24, 24, 24},
+               {24, 24, 24, 24},
+               {25, 24, 24, 24},
+       },
+       {
+               {24, 24, 24, 24},
+               {24, 24, 25, 24},
+               {24, 24, 24, 24},
+               {25, 24, 25, 24},
+       },
+       {
+               {24, 24, 24, 24},
+               {25, 24, 25, 24},
+               {24, 25, 24, 24},
+               {25, 24, 25, 24},
+       },
+       {
+               {24, 24, 24, 25},
+               {25, 24, 25, 24},
+               {24, 25, 24, 25},
+               {25, 24, 25, 24},
+       },
+       {
+               {24, 25, 24, 25},
+               {25, 24, 25, 24},
+               {24, 25, 24, 25},
+               {25, 25, 25, 24},
+       },
+       {
+               {24, 25, 24, 25},
+               {25, 24, 25, 25},
+               {24, 25, 24, 25},
+               {25, 25, 25, 25},
+       },
+       {
+               {24, 25, 24, 25},
+               {25, 25, 25, 25},
+               {25, 25, 24, 25},
+               {25, 25, 25, 25},
+       },
+       {
+               {24, 25, 25, 25},
+               {25, 25, 25, 25},
+               {25, 25, 25, 25},
+               {25, 25, 25, 25},
+       },
+       {
+               {25, 25, 25, 25},
+               {25, 25, 25, 25},
+               {25, 25, 25, 25},
+               {26, 25, 25, 25},
+       },
+       {
+               {25, 25, 25, 25},
+               {25, 25, 26, 25},
+               {25, 25, 25, 25},
+               {26, 25, 26, 25},
+       },
+       {
+               {25, 25, 25, 25},
+               {26, 25, 26, 25},
+               {25, 25, 25, 25},
+               {26, 25, 26, 25},
+       },
+       {
+               {25, 25, 25, 26},
+               {26, 25, 26, 25},
+               {25, 26, 25, 25},
+               {26, 25, 26, 25},
+       },
+       {
+               {25, 26, 25, 26},
+               {26, 25, 26, 25},
+               {25, 26, 25, 26},
+               {26, 25, 26, 25},
+       },
+       {
+               {25, 26, 25, 26},
+               {26, 25, 26, 26},
+               {25, 26, 25, 26},
+               {26, 26, 26, 25},
+       },
+       {
+               {25, 26, 25, 26},
+               {26, 26, 26, 26},
+               {25, 26, 25, 26},
+               {26, 26, 26, 26},
+       },
+       {
+               {25, 26, 26, 26},
+               {26, 26, 26, 26},
+               {26, 26, 25, 26},
+               {26, 26, 26, 26},
+       },
+       {
+               {26, 26, 26, 26},
+               {26, 26, 26, 26},
+               {26, 26, 26, 26},
+               {26, 26, 26, 26},
+       },
+       {
+               {26, 26, 26, 26},
+               {26, 26, 27, 26},
+               {26, 26, 26, 26},
+               {27, 26, 26, 26},
+       },
+       {
+               {26, 26, 26, 26},
+               {27, 26, 27, 26},
+               {26, 26, 26, 26},
+               {27, 26, 27, 26},
+       },
+       {
+               {26, 26, 26, 27},
+               {27, 26, 27, 26},
+               {26, 27, 26, 26},
+               {27, 26, 27, 26},
+       },
+       {
+               {26, 27, 26, 27},
+               {27, 26, 27, 26},
+               {26, 27, 26, 27},
+               {27, 26, 27, 26},
+       },
+       {
+               {26, 27, 26, 27},
+               {27, 26, 27, 27},
+               {26, 27, 26, 27},
+               {27, 27, 27, 26},
+       },
+       {
+               {26, 27, 26, 27},
+               {27, 27, 27, 27},
+               {26, 27, 26, 27},
+               {27, 27, 27, 27},
+       },
+       {
+               {26, 27, 27, 27},
+               {27, 27, 27, 27},
+               {27, 27, 26, 27},
+               {27, 27, 27, 27},
+       },
+       {
+               {27, 27, 27, 27},
+               {27, 27, 27, 27},
+               {27, 27, 27, 27},
+               {27, 27, 27, 27},
+       },
+       {
+               {27, 27, 27, 27},
+               {27, 27, 28, 27},
+               {27, 27, 27, 27},
+               {28, 27, 27, 27},
+       },
+       {
+               {27, 27, 27, 27},
+               {27, 27, 28, 27},
+               {27, 27, 27, 27},
+               {28, 27, 28, 27},
+       },
+       {
+               {27, 27, 27, 27},
+               {28, 27, 28, 27},
+               {27, 28, 27, 27},
+               {28, 27, 28, 27},
+       },
+       {
+               {27, 27, 27, 28},
+               {28, 27, 28, 27},
+               {27, 28, 27, 28},
+               {28, 27, 28, 27},
+       },
+       {
+               {27, 28, 27, 28},
+               {28, 27, 28, 27},
+               {27, 28, 27, 28},
+               {28, 28, 28, 27},
+       },
+       {
+               {27, 28, 27, 28},
+               {28, 27, 28, 28},
+               {27, 28, 27, 28},
+               {28, 28, 28, 28},
+       },
+       {
+               {27, 28, 27, 28},
+               {28, 28, 28, 28},
+               {28, 28, 27, 28},
+               {28, 28, 28, 28},
+       },
+       {
+               {27, 28, 28, 28},
+               {28, 28, 28, 28},
+               {28, 28, 28, 28},
+               {28, 28, 28, 28},
+       },
+       {
+               {28, 28, 28, 28},
+               {28, 28, 28, 28},
+               {28, 28, 28, 28},
+               {29, 28, 28, 28},
+       },
+       {
+               {28, 28, 28, 28},
+               {28, 28, 29, 28},
+               {28, 28, 28, 28},
+               {29, 28, 29, 28},
+       },
+       {
+               {28, 28, 28, 28},
+               {29, 28, 29, 28},
+               {28, 29, 28, 28},
+               {29, 28, 29, 28},
+       },
+       {
+               {28, 28, 28, 29},
+               {29, 28, 29, 28},
+               {28, 29, 28, 29},
+               {29, 28, 29, 28},
+       },
+       {
+               {28, 29, 28, 29},
+               {29, 28, 29, 28},
+               {28, 29, 28, 29},
+               {29, 29, 29, 28},
+       },
+       {
+               {28, 29, 28, 29},
+               {29, 28, 29, 29},
+               {28, 29, 28, 29},
+               {29, 29, 29, 29},
+       },
+       {
+               {28, 29, 28, 29},
+               {29, 29, 29, 29},
+               {29, 29, 28, 29},
+               {29, 29, 29, 29},
+       },
+       {
+               {28, 29, 29, 29},
+               {29, 29, 29, 29},
+               {29, 29, 29, 29},
+               {29, 29, 29, 29},
+       },
+       {
+               {29, 29, 29, 29},
+               {29, 29, 29, 29},
+               {29, 29, 29, 29},
+               {30, 29, 29, 29},
+       },
+       {
+               {29, 29, 29, 29},
+               {29, 29, 30, 29},
+               {29, 29, 29, 29},
+               {30, 29, 29, 29},
+       },
+       {
+               {29, 29, 29, 29},
+               {30, 29, 30, 29},
+               {29, 29, 29, 29},
+               {30, 29, 30, 29},
+       },
+       {
+               {29, 29, 29, 30},
+               {30, 29, 30, 29},
+               {29, 30, 29, 29},
+               {30, 29, 30, 29},
+       },
+       {
+               {29, 30, 29, 30},
+               {30, 29, 30, 29},
+               {29, 30, 29, 30},
+               {30, 29, 30, 29},
+       },
+       {
+               {29, 30, 29, 30},
+               {30, 29, 30, 30},
+               {29, 30, 29, 30},
+               {30, 30, 30, 29},
+       },
+       {
+               {29, 30, 29, 30},
+               {30, 30, 30, 30},
+               {29, 30, 29, 30},
+               {30, 30, 30, 30},
+       },
+       {
+               {29, 30, 30, 30},
+               {30, 30, 30, 30},
+               {30, 30, 29, 30},
+               {30, 30, 30, 30},
+       },
+       {
+               {30, 30, 30, 30},
+               {30, 30, 30, 30},
+               {30, 30, 30, 30},
+               {30, 30, 30, 30},
+       },
+       {
+               {30, 30, 30, 30},
+               {30, 30, 31, 30},
+               {30, 30, 30, 30},
+               {31, 30, 30, 30},
+       },
+       {
+               {30, 30, 30, 30},
+               {31, 30, 31, 30},
+               {30, 30, 30, 30},
+               {31, 30, 31, 30},
+       },
+       {
+               {30, 30, 30, 31},
+               {31, 30, 31, 30},
+               {30, 31, 30, 30},
+               {31, 30, 31, 30},
+       },
+       {
+               {30, 31, 30, 31},
+               {31, 30, 31, 30},
+               {30, 31, 30, 31},
+               {31, 30, 31, 30},
+       },
+       {
+               {30, 31, 30, 31},
+               {31, 30, 31, 31},
+               {30, 31, 30, 31},
+               {31, 31, 31, 30},
+       },
+       {
+               {30, 31, 30, 31},
+               {31, 31, 31, 31},
+               {30, 31, 30, 31},
+               {31, 31, 31, 31},
+       },
+       {
+               {30, 31, 31, 31},
+               {31, 31, 31, 31},
+               {31, 31, 30, 31},
+               {31, 31, 31, 31},
+       },
+       {
+               {31, 31, 31, 31},
+               {31, 31, 31, 31},
+               {31, 31, 31, 31},
+               {31, 31, 31, 31},
+       },
+};
+
+static const uint8_t dither_g[256][4][4] =
+{
+       {
+               {0, 0, 0, 0},
+               {0, 0, 0, 0},
+               {0, 0, 0, 0},
+               {0, 0, 0, 0},
+       },
+       {
+               {0, 0, 0, 0},
+               {1, 0, 1, 0},
+               {0, 0, 0, 0},
+               {1, 0, 1, 0},
+       },
+       {
+               {0, 1, 0, 1},
+               {1, 0, 1, 0},
+               {0, 1, 0, 1},
+               {1, 0, 1, 0},
+       },
+       {
+               {0, 1, 0, 1},
+               {1, 1, 1, 1},
+               {0, 1, 0, 1},
+               {1, 1, 1, 1},
+       },
+       {
+               {1, 1, 1, 1},
+               {1, 1, 1, 1},
+               {1, 1, 1, 1},
+               {1, 1, 1, 1},
+       },
+       {
+               {1, 1, 1, 1},
+               {2, 1, 2, 1},
+               {1, 1, 1, 1},
+               {2, 1, 2, 1},
+       },
+       {
+               {1, 2, 1, 2},
+               {2, 1, 2, 1},
+               {1, 2, 1, 2},
+               {2, 1, 2, 1},
+       },
+       {
+               {1, 2, 1, 2},
+               {2, 2, 2, 2},
+               {1, 2, 1, 2},
+               {2, 2, 2, 2},
+       },
+       {
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+               {2, 2, 2, 2},
+       },
+       {
+               {2, 2, 2, 2},
+               {3, 2, 3, 2},
+               {2, 2, 2, 2},
+               {3, 2, 3, 2},
+       },
+       {
+               {2, 3, 2, 3},
+               {3, 2, 3, 2},
+               {2, 3, 2, 3},
+               {3, 2, 3, 2},
+       },
+       {
+               {2, 3, 2, 3},
+               {3, 3, 3, 3},
+               {2, 3, 2, 3},
+               {3, 3, 3, 3},
+       },
+       {
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+               {3, 3, 3, 3},
+       },
+       {
+               {3, 3, 3, 3},
+               {4, 3, 4, 3},
+               {3, 3, 3, 3},
+               {4, 3, 4, 3},
+       },
+       {
+               {3, 4, 3, 4},
+               {4, 3, 4, 3},
+               {3, 4, 3, 4},
+               {4, 3, 4, 3},
+       },
+       {
+               {3, 4, 3, 4},
+               {4, 4, 4, 4},
+               {3, 4, 3, 4},
+               {4, 4, 4, 4},
+       },
+       {
+               {3, 4, 4, 4},
+               {4, 4, 4, 4},
+               {4, 4, 4, 4},
+               {4, 4, 4, 4},
+       },
+       {
+               {4, 4, 4, 4},
+               {4, 4, 5, 4},
+               {4, 4, 4, 4},
+               {5, 4, 5, 4},
+       },
+       {
+               {4, 4, 4, 5},
+               {5, 4, 5, 4},
+               {4, 5, 4, 5},
+               {5, 4, 5, 4},
+       },
+       {
+               {4, 5, 4, 5},
+               {5, 4, 5, 5},
+               {4, 5, 4, 5},
+               {5, 5, 5, 5},
+       },
+       {
+               {4, 5, 5, 5},
+               {5, 5, 5, 5},
+               {5, 5, 5, 5},
+               {5, 5, 5, 5},
+       },
+       {
+               {5, 5, 5, 5},
+               {5, 5, 6, 5},
+               {5, 5, 5, 5},
+               {6, 5, 6, 5},
+       },
+       {
+               {5, 5, 5, 6},
+               {6, 5, 6, 5},
+               {5, 6, 5, 6},
+               {6, 5, 6, 5},
+       },
+       {
+               {5, 6, 5, 6},
+               {6, 5, 6, 6},
+               {5, 6, 5, 6},
+               {6, 6, 6, 6},
+       },
+       {
+               {5, 6, 6, 6},
+               {6, 6, 6, 6},
+               {6, 6, 6, 6},
+               {6, 6, 6, 6},
+       },
+       {
+               {6, 6, 6, 6},
+               {6, 6, 7, 6},
+               {6, 6, 6, 6},
+               {7, 6, 7, 6},
+       },
+       {
+               {6, 6, 6, 7},
+               {7, 6, 7, 6},
+               {6, 7, 6, 7},
+               {7, 6, 7, 6},
+       },
+       {
+               {6, 7, 6, 7},
+               {7, 6, 7, 7},
+               {6, 7, 6, 7},
+               {7, 7, 7, 7},
+       },
+       {
+               {6, 7, 7, 7},
+               {7, 7, 7, 7},
+               {7, 7, 7, 7},
+               {7, 7, 7, 7},
+       },
+       {
+               {7, 7, 7, 7},
+               {7, 7, 8, 7},
+               {7, 7, 7, 7},
+               {8, 7, 8, 7},
+       },
+       {
+               {7, 7, 7, 8},
+               {8, 7, 8, 7},
+               {7, 8, 7, 8},
+               {8, 7, 8, 7},
+       },
+       {
+               {7, 8, 7, 8},
+               {8, 7, 8, 8},
+               {7, 8, 7, 8},
+               {8, 8, 8, 8},
+       },
+       {
+               {7, 8, 8, 8},
+               {8, 8, 8, 8},
+               {8, 8, 7, 8},
+               {8, 8, 8, 8},
+       },
+       {
+               {8, 8, 8, 8},
+               {8, 8, 9, 8},
+               {8, 8, 8, 8},
+               {9, 8, 8, 8},
+       },
+       {
+               {8, 8, 8, 9},
+               {9, 8, 9, 8},
+               {8, 9, 8, 8},
+               {9, 8, 9, 8},
+       },
+       {
+               {8, 9, 8, 9},
+               {9, 8, 9, 9},
+               {8, 9, 8, 9},
+               {9, 9, 9, 8},
+       },
+       {
+               {8, 9, 9, 9},
+               {9, 9, 9, 9},
+               {9, 9, 8, 9},
+               {9, 9, 9, 9},
+       },
+       {
+               {9, 9, 9, 9},
+               {9, 9, 10, 9},
+               {9, 9, 9, 9},
+               {10, 9, 9, 9},
+       },
+       {
+               {9, 9, 9, 10},
+               {10, 9, 10, 9},
+               {9, 10, 9, 9},
+               {10, 9, 10, 9},
+       },
+       {
+               {9, 10, 9, 10},
+               {10, 9, 10, 10},
+               {9, 10, 9, 10},
+               {10, 10, 10, 9},
+       },
+       {
+               {9, 10, 10, 10},
+               {10, 10, 10, 10},
+               {10, 10, 9, 10},
+               {10, 10, 10, 10},
+       },
+       {
+               {10, 10, 10, 10},
+               {10, 10, 11, 10},
+               {10, 10, 10, 10},
+               {11, 10, 10, 10},
+       },
+       {
+               {10, 10, 10, 11},
+               {11, 10, 11, 10},
+               {10, 11, 10, 10},
+               {11, 10, 11, 10},
+       },
+       {
+               {10, 11, 10, 11},
+               {11, 10, 11, 11},
+               {10, 11, 10, 11},
+               {11, 11, 11, 10},
+       },
+       {
+               {10, 11, 11, 11},
+               {11, 11, 11, 11},
+               {11, 11, 10, 11},
+               {11, 11, 11, 11},
+       },
+       {
+               {11, 11, 11, 11},
+               {11, 11, 12, 11},
+               {11, 11, 11, 11},
+               {12, 11, 11, 11},
+       },
+       {
+               {11, 11, 11, 12},
+               {12, 11, 12, 11},
+               {11, 12, 11, 11},
+               {12, 11, 12, 11},
+       },
+       {
+               {11, 12, 11, 12},
+               {12, 11, 12, 12},
+               {11, 12, 11, 12},
+               {12, 12, 12, 11},
+       },
+       {
+               {11, 12, 11, 12},
+               {12, 12, 12, 12},
+               {12, 12, 11, 12},
+               {12, 12, 12, 12},
+       },
+       {
+               {12, 12, 12, 12},
+               {12, 12, 12, 12},
+               {12, 12, 12, 12},
+               {13, 12, 12, 12},
+       },
+       {
+               {12, 12, 12, 12},
+               {13, 12, 13, 12},
+               {12, 13, 12, 12},
+               {13, 12, 13, 12},
+       },
+       {
+               {12, 13, 12, 13},
+               {13, 12, 13, 12},
+               {12, 13, 12, 13},
+               {13, 13, 13, 12},
+       },
+       {
+               {12, 13, 12, 13},
+               {13, 13, 13, 13},
+               {13, 13, 12, 13},
+               {13, 13, 13, 13},
+       },
+       {
+               {13, 13, 13, 13},
+               {13, 13, 13, 13},
+               {13, 13, 13, 13},
+               {14, 13, 13, 13},
+       },
+       {
+               {13, 13, 13, 13},
+               {14, 13, 14, 13},
+               {13, 14, 13, 13},
+               {14, 13, 14, 13},
+       },
+       {
+               {13, 14, 13, 14},
+               {14, 13, 14, 13},
+               {13, 14, 13, 14},
+               {14, 14, 14, 13},
+       },
+       {
+               {13, 14, 13, 14},
+               {14, 14, 14, 14},
+               {14, 14, 13, 14},
+               {14, 14, 14, 14},
+       },
+       {
+               {14, 14, 14, 14},
+               {14, 14, 14, 14},
+               {14, 14, 14, 14},
+               {15, 14, 14, 14},
+       },
+       {
+               {14, 14, 14, 14},
+               {15, 14, 15, 14},
+               {14, 15, 14, 14},
+               {15, 14, 15, 14},
+       },
+       {
+               {14, 15, 14, 15},
+               {15, 14, 15, 14},
+               {14, 15, 14, 15},
+               {15, 15, 15, 14},
+       },
+       {
+               {14, 15, 14, 15},
+               {15, 15, 15, 15},
+               {15, 15, 14, 15},
+               {15, 15, 15, 15},
+       },
+       {
+               {15, 15, 15, 15},
+               {15, 15, 15, 15},
+               {15, 15, 15, 15},
+               {16, 15, 15, 15},
+       },
+       {
+               {15, 15, 15, 15},
+               {16, 15, 16, 15},
+               {15, 16, 15, 15},
+               {16, 15, 16, 15},
+       },
+       {
+               {15, 16, 15, 16},
+               {16, 15, 16, 15},
+               {15, 16, 15, 16},
+               {16, 16, 16, 15},
+       },
+       {
+               {15, 16, 15, 16},
+               {16, 16, 16, 16},
+               {16, 16, 15, 16},
+               {16, 16, 16, 16},
+       },
+       {
+               {16, 16, 16, 16},
+               {16, 16, 16, 16},
+               {16, 16, 16, 16},
+               {17, 16, 16, 16},
+       },
+       {
+               {16, 16, 16, 16},
+               {17, 16, 17, 16},
+               {16, 17, 16, 16},
+               {17, 16, 17, 16},
+       },
+       {
+               {16, 17, 16, 17},
+               {17, 16, 17, 16},
+               {16, 17, 16, 17},
+               {17, 17, 17, 16},
+       },
+       {
+               {16, 17, 16, 17},
+               {17, 17, 17, 17},
+               {17, 17, 16, 17},
+               {17, 17, 17, 17},
+       },
+       {
+               {17, 17, 17, 17},
+               {17, 17, 17, 17},
+               {17, 17, 17, 17},
+               {18, 17, 17, 17},
+       },
+       {
+               {17, 17, 17, 17},
+               {18, 17, 18, 17},
+               {17, 18, 17, 17},
+               {18, 17, 18, 17},
+       },
+       {
+               {17, 18, 17, 18},
+               {18, 17, 18, 17},
+               {17, 18, 17, 18},
+               {18, 18, 18, 17},
+       },
+       {
+               {17, 18, 17, 18},
+               {18, 18, 18, 18},
+               {18, 18, 17, 18},
+               {18, 18, 18, 18},
+       },
+       {
+               {18, 18, 18, 18},
+               {18, 18, 18, 18},
+               {18, 18, 18, 18},
+               {19, 18, 18, 18},
+       },
+       {
+               {18, 18, 18, 18},
+               {19, 18, 19, 18},
+               {18, 19, 18, 18},
+               {19, 18, 19, 18},
+       },
+       {
+               {18, 19, 18, 19},
+               {19, 18, 19, 18},
+               {18, 19, 18, 19},
+               {19, 19, 19, 18},
+       },
+       {
+               {18, 19, 18, 19},
+               {19, 19, 19, 19},
+               {19, 19, 18, 19},
+               {19, 19, 19, 19},
+       },
+       {
+               {19, 19, 19, 19},
+               {19, 19, 19, 19},
+               {19, 19, 19, 19},
+               {20, 19, 19, 19},
+       },
+       {
+               {19, 19, 19, 19},
+               {20, 19, 20, 19},
+               {19, 20, 19, 19},
+               {20, 19, 20, 19},
+       },
+       {
+               {19, 20, 19, 20},
+               {20, 19, 20, 19},
+               {19, 20, 19, 20},
+               {20, 20, 20, 19},
+       },
+       {
+               {19, 20, 19, 20},
+               {20, 20, 20, 20},
+               {19, 20, 19, 20},
+               {20, 20, 20, 20},
+       },
+       {
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+               {20, 20, 20, 20},
+       },
+       {
+               {20, 20, 20, 20},
+               {21, 20, 21, 20},
+               {20, 20, 20, 20},
+               {21, 20, 21, 20},
+       },
+       {
+               {20, 21, 20, 21},
+               {21, 20, 21, 20},
+               {20, 21, 20, 21},
+               {21, 20, 21, 20},
+       },
+       {
+               {20, 21, 20, 21},
+               {21, 21, 21, 21},
+               {20, 21, 20, 21},
+               {21, 21, 21, 21},
+       },
+       {
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+               {21, 21, 21, 21},
+       },
+       {
+               {21, 21, 21, 21},
+               {22, 21, 22, 21},
+               {21, 21, 21, 21},
+               {22, 21, 22, 21},
+       },
+       {
+               {21, 22, 21, 22},
+               {22, 21, 22, 21},
+               {21, 22, 21, 22},
+               {22, 21, 22, 21},
+       },
+       {
+               {21, 22, 21, 22},
+               {22, 22, 22, 22},
+               {21, 22, 21, 22},
+               {22, 22, 22, 22},
+       },
+       {
+               {22, 22, 22, 22},
+               {22, 22, 22, 22},
+               {22, 22, 22, 22},
+               {22, 22, 22, 22},
+       },
+       {
+               {22, 22, 22, 22},
+               {23, 22, 23, 22},
+               {22, 22, 22, 22},
+               {23, 22, 23, 22},
+       },
+       {
+               {22, 23, 22, 23},
+               {23, 22, 23, 22},
+               {22, 23, 22, 23},
+               {23, 22, 23, 22},
+       },
+       {
+               {22, 23, 22, 23},
+               {23, 23, 23, 23},
+               {22, 23, 22, 23},
+               {23, 23, 23, 23},
+       },
+       {
+               {23, 23, 23, 23},
+               {23, 23, 23, 23},
+               {23, 23, 23, 23},
+               {23, 23, 23, 23},
+       },
+       {
+               {23, 23, 23, 23},
+               {24, 23, 24, 23},
+               {23, 23, 23, 23},
+               {24, 23, 24, 23},
+       },
+       {
+               {23, 24, 23, 24},
+               {24, 23, 24, 23},
+               {23, 24, 23, 24},
+               {24, 23, 24, 23},
+       },
+       {
+               {23, 24, 23, 24},
+               {24, 23, 24, 24},
+               {23, 24, 23, 24},
+               {24, 24, 24, 24},
+       },
+       {
+               {23, 24, 24, 24},
+               {24, 24, 24, 24},
+               {24, 24, 24, 24},
+               {24, 24, 24, 24},
+       },
+       {
+               {24, 24, 24, 24},
+               {24, 24, 25, 24},
+               {24, 24, 24, 24},
+               {25, 24, 25, 24},
+       },
+       {
+               {24, 24, 24, 25},
+               {25, 24, 25, 24},
+               {24, 25, 24, 25},
+               {25, 24, 25, 24},
+       },
+       {
+               {24, 25, 24, 25},
+               {25, 24, 25, 25},
+               {24, 25, 24, 25},
+               {25, 25, 25, 25},
+       },
+       {
+               {24, 25, 25, 25},
+               {25, 25, 25, 25},
+               {25, 25, 25, 25},
+               {25, 25, 25, 25},
+       },
+       {
+               {25, 25, 25, 25},
+               {25, 25, 26, 25},
+               {25, 25, 25, 25},
+               {26, 25, 26, 25},
+       },
+       {
+               {25, 25, 25, 26},
+               {26, 25, 26, 25},
+               {25, 26, 25, 26},
+               {26, 25, 26, 25},
+       },
+       {
+               {25, 26, 25, 26},
+               {26, 25, 26, 26},
+               {25, 26, 25, 26},
+               {26, 26, 26, 26},
+       },
+       {
+               {25, 26, 26, 26},
+               {26, 26, 26, 26},
+               {26, 26, 26, 26},
+               {26, 26, 26, 26},
+       },
+       {
+               {26, 26, 26, 26},
+               {26, 26, 27, 26},
+               {26, 26, 26, 26},
+               {27, 26, 27, 26},
+       },
+       {
+               {26, 26, 26, 27},
+               {27, 26, 27, 26},
+               {26, 27, 26, 27},
+               {27, 26, 27, 26},
+       },
+       {
+               {26, 27, 26, 27},
+               {27, 26, 27, 27},
+               {26, 27, 26, 27},
+               {27, 27, 27, 27},
+       },
+       {
+               {26, 27, 27, 27},
+               {27, 27, 27, 27},
+               {27, 27, 27, 27},
+               {27, 27, 27, 27},
+       },
+       {
+               {27, 27, 27, 27},
+               {27, 27, 28, 27},
+               {27, 27, 27, 27},
+               {28, 27, 28, 27},
+       },
+       {
+               {27, 27, 27, 28},
+               {28, 27, 28, 27},
+               {27, 28, 27, 28},
+               {28, 27, 28, 27},
+       },
+       {
+               {27, 28, 27, 28},
+               {28, 27, 28, 28},
+               {27, 28, 27, 28},
+               {28, 28, 28, 27},
+       },
+       {
+               {27, 28, 28, 28},
+               {28, 28, 28, 28},
+               {28, 28, 27, 28},
+               {28, 28, 28, 28},
+       },
+       {
+               {28, 28, 28, 28},
+               {28, 28, 29, 28},
+               {28, 28, 28, 28},
+               {29, 28, 28, 28},
+       },
+       {
+               {28, 28, 28, 29},
+               {29, 28, 29, 28},
+               {28, 29, 28, 28},
+               {29, 28, 29, 28},
+       },
+       {
+               {28, 29, 28, 29},
+               {29, 28, 29, 29},
+               {28, 29, 28, 29},
+               {29, 29, 29, 28},
+       },
+       {
+               {28, 29, 29, 29},
+               {29, 29, 29, 29},
+               {29, 29, 28, 29},
+               {29, 29, 29, 29},
+       },
+       {
+               {29, 29, 29, 29},
+               {29, 29, 30, 29},
+               {29, 29, 29, 29},
+               {30, 29, 29, 29},
+       },
+       {
+               {29, 29, 29, 30},
+               {30, 29, 30, 29},
+               {29, 30, 29, 29},
+               {30, 29, 30, 29},
+       },
+       {
+               {29, 30, 29, 30},
+               {30, 29, 30, 30},
+               {29, 30, 29, 30},
+               {30, 30, 30, 29},
+       },
+       {
+               {29, 30, 30, 30},
+               {30, 30, 30, 30},
+               {30, 30, 29, 30},
+               {30, 30, 30, 30},
+       },
+       {
+               {30, 30, 30, 30},
+               {30, 30, 31, 30},
+               {30, 30, 30, 30},
+               {31, 30, 30, 30},
+       },
+       {
+               {30, 30, 30, 31},
+               {31, 30, 31, 30},
+               {30, 31, 30, 30},
+               {31, 30, 31, 30},
+       },
+       {
+               {30, 31, 30, 31},
+               {31, 30, 31, 31},
+               {30, 31, 30, 31},
+               {31, 31, 31, 30},
+       },
+       {
+               {30, 31, 31, 31},
+               {31, 31, 31, 31},
+               {31, 31, 30, 31},
+               {31, 31, 31, 31},
+       },
+       {
+               {31, 31, 31, 31},
+               {31, 31, 32, 31},
+               {31, 31, 31, 31},
+               {32, 31, 31, 31},
+       },
+       {
+               {31, 31, 31, 32},
+               {32, 31, 32, 31},
+               {31, 32, 31, 31},
+               {32, 31, 32, 31},
+       },
+       {
+               {31, 32, 31, 32},
+               {32, 31, 32, 32},
+               {31, 32, 31, 32},
+               {32, 32, 32, 31},
+       },
+       {
+               {31, 32, 32, 32},
+               {32, 32, 32, 32},
+               {32, 32, 31, 32},
+               {32, 32, 32, 32},
+       },
+       {
+               {32, 32, 32, 32},
+               {32, 32, 33, 32},
+               {32, 32, 32, 32},
+               {33, 32, 32, 32},
+       },
+       {
+               {32, 32, 32, 33},
+               {33, 32, 33, 32},
+               {32, 33, 32, 32},
+               {33, 32, 33, 32},
+       },
+       {
+               {32, 33, 32, 33},
+               {33, 32, 33, 33},
+               {32, 33, 32, 33},
+               {33, 33, 33, 32},
+       },
+       {
+               {32, 33, 33, 33},
+               {33, 33, 33, 33},
+               {33, 33, 32, 33},
+               {33, 33, 33, 33},
+       },
+       {
+               {33, 33, 33, 33},
+               {33, 33, 34, 33},
+               {33, 33, 33, 33},
+               {34, 33, 33, 33},
+       },
+       {
+               {33, 33, 33, 34},
+               {34, 33, 34, 33},
+               {33, 34, 33, 33},
+               {34, 33, 34, 33},
+       },
+       {
+               {33, 34, 33, 34},
+               {34, 33, 34, 34},
+               {33, 34, 33, 34},
+               {34, 34, 34, 33},
+       },
+       {
+               {33, 34, 34, 34},
+               {34, 34, 34, 34},
+               {34, 34, 33, 34},
+               {34, 34, 34, 34},
+       },
+       {
+               {34, 34, 34, 34},
+               {34, 34, 35, 34},
+               {34, 34, 34, 34},
+               {35, 34, 34, 34},
+       },
+       {
+               {34, 34, 34, 35},
+               {35, 34, 35, 34},
+               {34, 35, 34, 34},
+               {35, 34, 35, 34},
+       },
+       {
+               {34, 35, 34, 35},
+               {35, 34, 35, 35},
+               {34, 35, 34, 35},
+               {35, 35, 35, 34},
+       },
+       {
+               {34, 35, 35, 35},
+               {35, 35, 35, 35},
+               {35, 35, 34, 35},
+               {35, 35, 35, 35},
+       },
+       {
+               {35, 35, 35, 35},
+               {35, 35, 36, 35},
+               {35, 35, 35, 35},
+               {36, 35, 35, 35},
+       },
+       {
+               {35, 35, 35, 36},
+               {36, 35, 36, 35},
+               {35, 36, 35, 35},
+               {36, 35, 36, 35},
+       },
+       {
+               {35, 36, 35, 36},
+               {36, 35, 36, 35},
+               {35, 36, 35, 36},
+               {36, 36, 36, 35},
+       },
+       {
+               {35, 36, 35, 36},
+               {36, 36, 36, 36},
+               {36, 36, 35, 36},
+               {36, 36, 36, 36},
+       },
+       {
+               {36, 36, 36, 36},
+               {36, 36, 36, 36},
+               {36, 36, 36, 36},
+               {37, 36, 36, 36},
+       },
+       {
+               {36, 36, 36, 36},
+               {37, 36, 37, 36},
+               {36, 37, 36, 36},
+               {37, 36, 37, 36},
+       },
+       {
+               {36, 37, 36, 37},
+               {37, 36, 37, 36},
+               {36, 37, 36, 37},
+               {37, 37, 37, 36},
+       },
+       {
+               {36, 37, 36, 37},
+               {37, 37, 37, 37},
+               {37, 37, 36, 37},
+               {37, 37, 37, 37},
+       },
+       {
+               {37, 37, 37, 37},
+               {37, 37, 37, 37},
+               {37, 37, 37, 37},
+               {38, 37, 37, 37},
+       },
+       {
+               {37, 37, 37, 37},
+               {38, 37, 38, 37},
+               {37, 38, 37, 37},
+               {38, 37, 38, 37},
+       },
+       {
+               {37, 38, 37, 38},
+               {38, 37, 38, 37},
+               {37, 38, 37, 38},
+               {38, 38, 38, 37},
+       },
+       {
+               {37, 38, 37, 38},
+               {38, 38, 38, 38},
+               {38, 38, 37, 38},
+               {38, 38, 38, 38},
+       },
+       {
+               {38, 38, 38, 38},
+               {38, 38, 38, 38},
+               {38, 38, 38, 38},
+               {39, 38, 38, 38},
+       },
+       {
+               {38, 38, 38, 38},
+               {39, 38, 39, 38},
+               {38, 39, 38, 38},
+               {39, 38, 39, 38},
+       },
+       {
+               {38, 39, 38, 39},
+               {39, 38, 39, 38},
+               {38, 39, 38, 39},
+               {39, 39, 39, 38},
+       },
+       {
+               {38, 39, 38, 39},
+               {39, 39, 39, 39},
+               {39, 39, 38, 39},
+               {39, 39, 39, 39},
+       },
+       {
+               {39, 39, 39, 39},
+               {39, 39, 39, 39},
+               {39, 39, 39, 39},
+               {40, 39, 39, 39},
+       },
+       {
+               {39, 39, 39, 39},
+               {40, 39, 40, 39},
+               {39, 40, 39, 39},
+               {40, 39, 40, 39},
+       },
+       {
+               {39, 40, 39, 40},
+               {40, 39, 40, 39},
+               {39, 40, 39, 40},
+               {40, 39, 40, 39},
+       },
+       {
+               {39, 40, 39, 40},
+               {40, 40, 40, 40},
+               {39, 40, 39, 40},
+               {40, 40, 40, 40},
+       },
+       {
+               {40, 40, 40, 40},
+               {40, 40, 40, 40},
+               {40, 40, 40, 40},
+               {40, 40, 40, 40},
+       },
+       {
+               {40, 40, 40, 40},
+               {41, 40, 41, 40},
+               {40, 40, 40, 40},
+               {41, 40, 41, 40},
+       },
+       {
+               {40, 41, 40, 41},
+               {41, 40, 41, 40},
+               {40, 41, 40, 41},
+               {41, 40, 41, 40},
+       },
+       {
+               {40, 41, 40, 41},
+               {41, 41, 41, 41},
+               {40, 41, 40, 41},
+               {41, 41, 41, 41},
+       },
+       {
+               {41, 41, 41, 41},
+               {41, 41, 41, 41},
+               {41, 41, 41, 41},
+               {41, 41, 41, 41},
+       },
+       {
+               {41, 41, 41, 41},
+               {42, 41, 42, 41},
+               {41, 41, 41, 41},
+               {42, 41, 42, 41},
+       },
+       {
+               {41, 42, 41, 42},
+               {42, 41, 42, 41},
+               {41, 42, 41, 42},
+               {42, 41, 42, 41},
+       },
+       {
+               {41, 42, 41, 42},
+               {42, 42, 42, 42},
+               {41, 42, 41, 42},
+               {42, 42, 42, 42},
+       },
+       {
+               {42, 42, 42, 42},
+               {42, 42, 42, 42},
+               {42, 42, 42, 42},
+               {42, 42, 42, 42},
+       },
+       {
+               {42, 42, 42, 42},
+               {43, 42, 43, 42},
+               {42, 42, 42, 42},
+               {43, 42, 43, 42},
+       },
+       {
+               {42, 43, 42, 43},
+               {43, 42, 43, 42},
+               {42, 43, 42, 43},
+               {43, 42, 43, 42},
+       },
+       {
+               {42, 43, 42, 43},
+               {43, 43, 43, 43},
+               {42, 43, 42, 43},
+               {43, 43, 43, 43},
+       },
+       {
+               {43, 43, 43, 43},
+               {43, 43, 43, 43},
+               {43, 43, 43, 43},
+               {43, 43, 43, 43},
+       },
+       {
+               {43, 43, 43, 43},
+               {44, 43, 44, 43},
+               {43, 43, 43, 43},
+               {44, 43, 44, 43},
+       },
+       {
+               {43, 43, 43, 44},
+               {44, 43, 44, 43},
+               {43, 44, 43, 44},
+               {44, 43, 44, 43},
+       },
+       {
+               {43, 44, 43, 44},
+               {44, 43, 44, 44},
+               {43, 44, 43, 44},
+               {44, 44, 44, 44},
+       },
+       {
+               {43, 44, 44, 44},
+               {44, 44, 44, 44},
+               {44, 44, 44, 44},
+               {44, 44, 44, 44},
+       },
+       {
+               {44, 44, 44, 44},
+               {44, 44, 45, 44},
+               {44, 44, 44, 44},
+               {45, 44, 45, 44},
+       },
+       {
+               {44, 44, 44, 45},
+               {45, 44, 45, 44},
+               {44, 45, 44, 45},
+               {45, 44, 45, 44},
+       },
+       {
+               {44, 45, 44, 45},
+               {45, 44, 45, 45},
+               {44, 45, 44, 45},
+               {45, 45, 45, 45},
+       },
+       {
+               {44, 45, 45, 45},
+               {45, 45, 45, 45},
+               {45, 45, 45, 45},
+               {45, 45, 45, 45},
+       },
+       {
+               {45, 45, 45, 45},
+               {45, 45, 46, 45},
+               {45, 45, 45, 45},
+               {46, 45, 46, 45},
+       },
+       {
+               {45, 45, 45, 46},
+               {46, 45, 46, 45},
+               {45, 46, 45, 46},
+               {46, 45, 46, 45},
+       },
+       {
+               {45, 46, 45, 46},
+               {46, 45, 46, 46},
+               {45, 46, 45, 46},
+               {46, 46, 46, 46},
+       },
+       {
+               {45, 46, 46, 46},
+               {46, 46, 46, 46},
+               {46, 46, 46, 46},
+               {46, 46, 46, 46},
+       },
+       {
+               {46, 46, 46, 46},
+               {46, 46, 47, 46},
+               {46, 46, 46, 46},
+               {47, 46, 47, 46},
+       },
+       {
+               {46, 46, 46, 47},
+               {47, 46, 47, 46},
+               {46, 47, 46, 47},
+               {47, 46, 47, 46},
+       },
+       {
+               {46, 47, 46, 47},
+               {47, 46, 47, 47},
+               {46, 47, 46, 47},
+               {47, 47, 47, 47},
+       },
+       {
+               {46, 47, 47, 47},
+               {47, 47, 47, 47},
+               {47, 47, 47, 47},
+               {47, 47, 47, 47},
+       },
+       {
+               {47, 47, 47, 47},
+               {47, 47, 48, 47},
+               {47, 47, 47, 47},
+               {48, 47, 48, 47},
+       },
+       {
+               {47, 47, 47, 48},
+               {48, 47, 48, 47},
+               {47, 48, 47, 48},
+               {48, 47, 48, 47},
+       },
+       {
+               {47, 48, 47, 48},
+               {48, 47, 48, 48},
+               {47, 48, 47, 48},
+               {48, 48, 48, 48},
+       },
+       {
+               {47, 48, 48, 48},
+               {48, 48, 48, 48},
+               {48, 48, 48, 48},
+               {48, 48, 48, 48},
+       },
+       {
+               {48, 48, 48, 48},
+               {48, 48, 49, 48},
+               {48, 48, 48, 48},
+               {49, 48, 49, 48},
+       },
+       {
+               {48, 48, 48, 49},
+               {49, 48, 49, 48},
+               {48, 49, 48, 49},
+               {49, 48, 49, 48},
+       },
+       {
+               {48, 49, 48, 49},
+               {49, 48, 49, 49},
+               {48, 49, 48, 49},
+               {49, 49, 49, 49},
+       },
+       {
+               {48, 49, 49, 49},
+               {49, 49, 49, 49},
+               {49, 49, 49, 49},
+               {49, 49, 49, 49},
+       },
+       {
+               {49, 49, 49, 49},
+               {49, 49, 50, 49},
+               {49, 49, 49, 49},
+               {50, 49, 50, 49},
+       },
+       {
+               {49, 49, 49, 50},
+               {50, 49, 50, 49},
+               {49, 50, 49, 50},
+               {50, 49, 50, 49},
+       },
+       {
+               {49, 50, 49, 50},
+               {50, 49, 50, 50},
+               {49, 50, 49, 50},
+               {50, 50, 50, 50},
+       },
+       {
+               {49, 50, 50, 50},
+               {50, 50, 50, 50},
+               {50, 50, 50, 50},
+               {50, 50, 50, 50},
+       },
+       {
+               {50, 50, 50, 50},
+               {50, 50, 51, 50},
+               {50, 50, 50, 50},
+               {51, 50, 51, 50},
+       },
+       {
+               {50, 50, 50, 51},
+               {51, 50, 51, 50},
+               {50, 51, 50, 51},
+               {51, 50, 51, 50},
+       },
+       {
+               {50, 51, 50, 51},
+               {51, 50, 51, 51},
+               {50, 51, 50, 51},
+               {51, 51, 51, 51},
+       },
+       {
+               {50, 51, 51, 51},
+               {51, 51, 51, 51},
+               {51, 51, 51, 51},
+               {51, 51, 51, 51},
+       },
+       {
+               {51, 51, 51, 51},
+               {51, 51, 52, 51},
+               {51, 51, 51, 51},
+               {52, 51, 52, 51},
+       },
+       {
+               {51, 51, 51, 52},
+               {52, 51, 52, 51},
+               {51, 52, 51, 51},
+               {52, 51, 52, 51},
+       },
+       {
+               {51, 52, 51, 52},
+               {52, 51, 52, 52},
+               {51, 52, 51, 52},
+               {52, 52, 52, 51},
+       },
+       {
+               {51, 52, 52, 52},
+               {52, 52, 52, 52},
+               {52, 52, 51, 52},
+               {52, 52, 52, 52},
+       },
+       {
+               {52, 52, 52, 52},
+               {52, 52, 53, 52},
+               {52, 52, 52, 52},
+               {53, 52, 52, 52},
+       },
+       {
+               {52, 52, 52, 53},
+               {53, 52, 53, 52},
+               {52, 53, 52, 52},
+               {53, 52, 53, 52},
+       },
+       {
+               {52, 53, 52, 53},
+               {53, 52, 53, 53},
+               {52, 53, 52, 53},
+               {53, 53, 53, 52},
+       },
+       {
+               {52, 53, 53, 53},
+               {53, 53, 53, 53},
+               {53, 53, 52, 53},
+               {53, 53, 53, 53},
+       },
+       {
+               {53, 53, 53, 53},
+               {53, 53, 54, 53},
+               {53, 53, 53, 53},
+               {54, 53, 53, 53},
+       },
+       {
+               {53, 53, 53, 54},
+               {54, 53, 54, 53},
+               {53, 54, 53, 53},
+               {54, 53, 54, 53},
+       },
+       {
+               {53, 54, 53, 54},
+               {54, 53, 54, 54},
+               {53, 54, 53, 54},
+               {54, 54, 54, 53},
+       },
+       {
+               {53, 54, 54, 54},
+               {54, 54, 54, 54},
+               {54, 54, 53, 54},
+               {54, 54, 54, 54},
+       },
+       {
+               {54, 54, 54, 54},
+               {54, 54, 55, 54},
+               {54, 54, 54, 54},
+               {55, 54, 54, 54},
+       },
+       {
+               {54, 54, 54, 55},
+               {55, 54, 55, 54},
+               {54, 55, 54, 54},
+               {55, 54, 55, 54},
+       },
+       {
+               {54, 55, 54, 55},
+               {55, 54, 55, 55},
+               {54, 55, 54, 55},
+               {55, 55, 55, 54},
+       },
+       {
+               {54, 55, 55, 55},
+               {55, 55, 55, 55},
+               {55, 55, 54, 55},
+               {55, 55, 55, 55},
+       },
+       {
+               {55, 55, 55, 55},
+               {55, 55, 56, 55},
+               {55, 55, 55, 55},
+               {56, 55, 55, 55},
+       },
+       {
+               {55, 55, 55, 55},
+               {56, 55, 56, 55},
+               {55, 56, 55, 55},
+               {56, 55, 56, 55},
+       },
+       {
+               {55, 56, 55, 56},
+               {56, 55, 56, 55},
+               {55, 56, 55, 56},
+               {56, 56, 56, 55},
+       },
+       {
+               {55, 56, 55, 56},
+               {56, 56, 56, 56},
+               {56, 56, 55, 56},
+               {56, 56, 56, 56},
+       },
+       {
+               {56, 56, 56, 56},
+               {56, 56, 56, 56},
+               {56, 56, 56, 56},
+               {57, 56, 56, 56},
+       },
+       {
+               {56, 56, 56, 56},
+               {57, 56, 57, 56},
+               {56, 57, 56, 56},
+               {57, 56, 57, 56},
+       },
+       {
+               {56, 57, 56, 57},
+               {57, 56, 57, 56},
+               {56, 57, 56, 57},
+               {57, 57, 57, 56},
+       },
+       {
+               {56, 57, 56, 57},
+               {57, 57, 57, 57},
+               {57, 57, 56, 57},
+               {57, 57, 57, 57},
+       },
+       {
+               {57, 57, 57, 57},
+               {57, 57, 57, 57},
+               {57, 57, 57, 57},
+               {58, 57, 57, 57},
+       },
+       {
+               {57, 57, 57, 57},
+               {58, 57, 58, 57},
+               {57, 58, 57, 57},
+               {58, 57, 58, 57},
+       },
+       {
+               {57, 58, 57, 58},
+               {58, 57, 58, 57},
+               {57, 58, 57, 58},
+               {58, 58, 58, 57},
+       },
+       {
+               {57, 58, 57, 58},
+               {58, 58, 58, 58},
+               {58, 58, 57, 58},
+               {58, 58, 58, 58},
+       },
+       {
+               {58, 58, 58, 58},
+               {58, 58, 58, 58},
+               {58, 58, 58, 58},
+               {59, 58, 58, 58},
+       },
+       {
+               {58, 58, 58, 58},
+               {59, 58, 59, 58},
+               {58, 59, 58, 58},
+               {59, 58, 59, 58},
+       },
+       {
+               {58, 59, 58, 59},
+               {59, 58, 59, 58},
+               {58, 59, 58, 59},
+               {59, 59, 59, 58},
+       },
+       {
+               {58, 59, 58, 59},
+               {59, 59, 59, 59},
+               {59, 59, 58, 59},
+               {59, 59, 59, 59},
+       },
+       {
+               {59, 59, 59, 59},
+               {59, 59, 59, 59},
+               {59, 59, 59, 59},
+               {60, 59, 59, 59},
+       },
+       {
+               {59, 59, 59, 59},
+               {60, 59, 60, 59},
+               {59, 59, 59, 59},
+               {60, 59, 60, 59},
+       },
+       {
+               {59, 60, 59, 60},
+               {60, 59, 60, 59},
+               {59, 60, 59, 60},
+               {60, 59, 60, 59},
+       },
+       {
+               {59, 60, 59, 60},
+               {60, 60, 60, 60},
+               {59, 60, 59, 60},
+               {60, 60, 60, 60},
+       },
+       {
+               {60, 60, 60, 60},
+               {60, 60, 60, 60},
+               {60, 60, 60, 60},
+               {60, 60, 60, 60},
+       },
+       {
+               {60, 60, 60, 60},
+               {61, 60, 61, 60},
+               {60, 60, 60, 60},
+               {61, 60, 61, 60},
+       },
+       {
+               {60, 61, 60, 61},
+               {61, 60, 61, 60},
+               {60, 61, 60, 61},
+               {61, 60, 61, 60},
+       },
+       {
+               {60, 61, 60, 61},
+               {61, 61, 61, 61},
+               {60, 61, 60, 61},
+               {61, 61, 61, 61},
+       },
+       {
+               {61, 61, 61, 61},
+               {61, 61, 61, 61},
+               {61, 61, 61, 61},
+               {61, 61, 61, 61},
+       },
+       {
+               {61, 61, 61, 61},
+               {62, 61, 62, 61},
+               {61, 61, 61, 61},
+               {62, 61, 62, 61},
+       },
+       {
+               {61, 62, 61, 62},
+               {62, 61, 62, 61},
+               {61, 62, 61, 62},
+               {62, 61, 62, 61},
+       },
+       {
+               {61, 62, 61, 62},
+               {62, 62, 62, 62},
+               {61, 62, 61, 62},
+               {62, 62, 62, 62},
+       },
+       {
+               {62, 62, 62, 62},
+               {62, 62, 62, 62},
+               {62, 62, 62, 62},
+               {62, 62, 62, 62},
+       },
+       {
+               {62, 62, 62, 62},
+               {63, 62, 63, 62},
+               {62, 62, 62, 62},
+               {63, 62, 63, 62},
+       },
+       {
+               {62, 63, 62, 63},
+               {63, 62, 63, 62},
+               {62, 63, 62, 63},
+               {63, 62, 63, 62},
+       },
+       {
+               {62, 63, 62, 63},
+               {63, 63, 63, 63},
+               {62, 63, 62, 63},
+               {63, 63, 63, 63},
+       },
+       {
+               {63, 63, 63, 63},
+               {63, 63, 63, 63},
+               {63, 63, 63, 63},
+               {63, 63, 63, 63},
+       },
+};
+
+static const uint8_t dither_rb2x2[256][2][2] =
+{
+       {
+               {0, 0},
+               {0, 0},
+       },
+       {
+               {0, 0},
+               {1, 0},
+       },
+       {
+               {0, 0},
+               {1, 0},
+       },
+       {
+               {0, 1},
+               {1, 0},
+       },
+       {
+               {0, 1},
+               {1, 0},
+       },
+       {
+               {0, 1},
+               {1, 1},
+       },
+       {
+               {0, 1},
+               {1, 1},
+       },
+       {
+               {1, 1},
+               {1, 1},
+       },
+       {
+               {1, 1},
+               {1, 1},
+       },
+       {
+               {1, 1},
+               {2, 1},
+       },
+       {
+               {1, 1},
+               {2, 1},
+       },
+       {
+               {1, 2},
+               {2, 1},
+       },
+       {
+               {1, 2},
+               {2, 1},
+       },
+       {
+               {1, 2},
+               {2, 2},
+       },
+       {
+               {1, 2},
+               {2, 2},
+       },
+       {
+               {2, 2},
+               {2, 2},
+       },
+       {
+               {2, 2},
+               {2, 2},
+       },
+       {
+               {2, 2},
+               {2, 2},
+       },
+       {
+               {2, 2},
+               {3, 2},
+       },
+       {
+               {2, 2},
+               {3, 2},
+       },
+       {
+               {2, 3},
+               {3, 2},
+       },
+       {
+               {2, 3},
+               {3, 2},
+       },
+       {
+               {2, 3},
+               {3, 3},
+       },
+       {
+               {2, 3},
+               {3, 3},
+       },
+       {
+               {3, 3},
+               {3, 3},
+       },
+       {
+               {3, 3},
+               {3, 3},
+       },
+       {
+               {3, 3},
+               {4, 3},
+       },
+       {
+               {3, 3},
+               {4, 3},
+       },
+       {
+               {3, 4},
+               {4, 3},
+       },
+       {
+               {3, 4},
+               {4, 3},
+       },
+       {
+               {3, 4},
+               {4, 4},
+       },
+       {
+               {3, 4},
+               {4, 4},
+       },
+       {
+               {4, 4},
+               {4, 4},
+       },
+       {
+               {4, 4},
+               {4, 4},
+       },
+       {
+               {4, 4},
+               {5, 4},
+       },
+       {
+               {4, 4},
+               {5, 4},
+       },
+       {
+               {4, 5},
+               {5, 4},
+       },
+       {
+               {4, 5},
+               {5, 4},
+       },
+       {
+               {4, 5},
+               {5, 5},
+       },
+       {
+               {4, 5},
+               {5, 5},
+       },
+       {
+               {5, 5},
+               {5, 5},
+       },
+       {
+               {5, 5},
+               {5, 5},
+       },
+       {
+               {5, 5},
+               {6, 5},
+       },
+       {
+               {5, 5},
+               {6, 5},
+       },
+       {
+               {5, 6},
+               {6, 5},
+       },
+       {
+               {5, 6},
+               {6, 5},
+       },
+       {
+               {5, 6},
+               {6, 6},
+       },
+       {
+               {5, 6},
+               {6, 6},
+       },
+       {
+               {5, 6},
+               {6, 6},
+       },
+       {
+               {6, 6},
+               {6, 6},
+       },
+       {
+               {6, 6},
+               {6, 6},
+       },
+       {
+               {6, 6},
+               {7, 6},
+       },
+       {
+               {6, 6},
+               {7, 6},
+       },
+       {
+               {6, 7},
+               {7, 6},
+       },
+       {
+               {6, 7},
+               {7, 6},
+       },
+       {
+               {6, 7},
+               {7, 7},
+       },
+       {
+               {6, 7},
+               {7, 7},
+       },
+       {
+               {7, 7},
+               {7, 7},
+       },
+       {
+               {7, 7},
+               {7, 7},
+       },
+       {
+               {7, 7},
+               {8, 7},
+       },
+       {
+               {7, 7},
+               {8, 7},
+       },
+       {
+               {7, 8},
+               {8, 7},
+       },
+       {
+               {7, 8},
+               {8, 7},
+       },
+       {
+               {7, 8},
+               {8, 8},
+       },
+       {
+               {7, 8},
+               {8, 8},
+       },
+       {
+               {8, 8},
+               {8, 8},
+       },
+       {
+               {8, 8},
+               {8, 8},
+       },
+       {
+               {8, 8},
+               {9, 8},
+       },
+       {
+               {8, 8},
+               {9, 8},
+       },
+       {
+               {8, 9},
+               {9, 8},
+       },
+       {
+               {8, 9},
+               {9, 8},
+       },
+       {
+               {8, 9},
+               {9, 9},
+       },
+       {
+               {8, 9},
+               {9, 9},
+       },
+       {
+               {9, 9},
+               {9, 9},
+       },
+       {
+               {9, 9},
+               {9, 9},
+       },
+       {
+               {9, 9},
+               {10, 9},
+       },
+       {
+               {9, 9},
+               {10, 9},
+       },
+       {
+               {9, 10},
+               {10, 9},
+       },
+       {
+               {9, 10},
+               {10, 9},
+       },
+       {
+               {9, 10},
+               {10, 10},
+       },
+       {
+               {9, 10},
+               {10, 10},
+       },
+       {
+               {9, 10},
+               {10, 10},
+       },
+       {
+               {10, 10},
+               {10, 10},
+       },
+       {
+               {10, 10},
+               {10, 10},
+       },
+       {
+               {10, 10},
+               {11, 10},
+       },
+       {
+               {10, 10},
+               {11, 10},
+       },
+       {
+               {10, 11},
+               {11, 10},
+       },
+       {
+               {10, 11},
+               {11, 10},
+       },
+       {
+               {10, 11},
+               {11, 11},
+       },
+       {
+               {10, 11},
+               {11, 11},
+       },
+       {
+               {11, 11},
+               {11, 11},
+       },
+       {
+               {11, 11},
+               {11, 11},
+       },
+       {
+               {11, 11},
+               {12, 11},
+       },
+       {
+               {11, 11},
+               {12, 11},
+       },
+       {
+               {11, 12},
+               {12, 11},
+       },
+       {
+               {11, 12},
+               {12, 11},
+       },
+       {
+               {11, 12},
+               {12, 12},
+       },
+       {
+               {11, 12},
+               {12, 12},
+       },
+       {
+               {12, 12},
+               {12, 12},
+       },
+       {
+               {12, 12},
+               {12, 12},
+       },
+       {
+               {12, 12},
+               {13, 12},
+       },
+       {
+               {12, 12},
+               {13, 12},
+       },
+       {
+               {12, 13},
+               {13, 12},
+       },
+       {
+               {12, 13},
+               {13, 12},
+       },
+       {
+               {12, 13},
+               {13, 13},
+       },
+       {
+               {12, 13},
+               {13, 13},
+       },
+       {
+               {13, 13},
+               {13, 13},
+       },
+       {
+               {13, 13},
+               {13, 13},
+       },
+       {
+               {13, 13},
+               {14, 13},
+       },
+       {
+               {13, 13},
+               {14, 13},
+       },
+       {
+               {13, 14},
+               {14, 13},
+       },
+       {
+               {13, 14},
+               {14, 13},
+       },
+       {
+               {13, 14},
+               {14, 13},
+       },
+       {
+               {13, 14},
+               {14, 14},
+       },
+       {
+               {13, 14},
+               {14, 14},
+       },
+       {
+               {14, 14},
+               {14, 14},
+       },
+       {
+               {14, 14},
+               {14, 14},
+       },
+       {
+               {14, 14},
+               {15, 14},
+       },
+       {
+               {14, 14},
+               {15, 14},
+       },
+       {
+               {14, 15},
+               {15, 14},
+       },
+       {
+               {14, 15},
+               {15, 14},
+       },
+       {
+               {14, 15},
+               {15, 15},
+       },
+       {
+               {14, 15},
+               {15, 15},
+       },
+       {
+               {15, 15},
+               {15, 15},
+       },
+       {
+               {15, 15},
+               {15, 15},
+       },
+       {
+               {15, 15},
+               {16, 15},
+       },
+       {
+               {15, 15},
+               {16, 15},
+       },
+       {
+               {15, 16},
+               {16, 15},
+       },
+       {
+               {15, 16},
+               {16, 15},
+       },
+       {
+               {15, 16},
+               {16, 16},
+       },
+       {
+               {15, 16},
+               {16, 16},
+       },
+       {
+               {16, 16},
+               {16, 16},
+       },
+       {
+               {16, 16},
+               {16, 16},
+       },
+       {
+               {16, 16},
+               {17, 16},
+       },
+       {
+               {16, 16},
+               {17, 16},
+       },
+       {
+               {16, 17},
+               {17, 16},
+       },
+       {
+               {16, 17},
+               {17, 16},
+       },
+       {
+               {16, 17},
+               {17, 17},
+       },
+       {
+               {16, 17},
+               {17, 17},
+       },
+       {
+               {17, 17},
+               {17, 17},
+       },
+       {
+               {17, 17},
+               {17, 17},
+       },
+       {
+               {17, 17},
+               {18, 17},
+       },
+       {
+               {17, 17},
+               {18, 17},
+       },
+       {
+               {17, 18},
+               {18, 17},
+       },
+       {
+               {17, 18},
+               {18, 17},
+       },
+       {
+               {17, 18},
+               {18, 18},
+       },
+       {
+               {17, 18},
+               {18, 18},
+       },
+       {
+               {18, 18},
+               {18, 18},
+       },
+       {
+               {18, 18},
+               {18, 18},
+       },
+       {
+               {18, 18},
+               {19, 18},
+       },
+       {
+               {18, 18},
+               {19, 18},
+       },
+       {
+               {18, 19},
+               {19, 18},
+       },
+       {
+               {18, 19},
+               {19, 18},
+       },
+       {
+               {18, 19},
+               {19, 19},
+       },
+       {
+               {18, 19},
+               {19, 19},
+       },
+       {
+               {19, 19},
+               {19, 19},
+       },
+       {
+               {19, 19},
+               {19, 19},
+       },
+       {
+               {19, 19},
+               {20, 19},
+       },
+       {
+               {19, 19},
+               {20, 19},
+       },
+       {
+               {19, 20},
+               {20, 19},
+       },
+       {
+               {19, 20},
+               {20, 19},
+       },
+       {
+               {19, 20},
+               {20, 19},
+       },
+       {
+               {19, 20},
+               {20, 20},
+       },
+       {
+               {19, 20},
+               {20, 20},
+       },
+       {
+               {20, 20},
+               {20, 20},
+       },
+       {
+               {20, 20},
+               {20, 20},
+       },
+       {
+               {20, 20},
+               {21, 20},
+       },
+       {
+               {20, 20},
+               {21, 20},
+       },
+       {
+               {20, 21},
+               {21, 20},
+       },
+       {
+               {20, 21},
+               {21, 20},
+       },
+       {
+               {20, 21},
+               {21, 21},
+       },
+       {
+               {20, 21},
+               {21, 21},
+       },
+       {
+               {21, 21},
+               {21, 21},
+       },
+       {
+               {21, 21},
+               {21, 21},
+       },
+       {
+               {21, 21},
+               {22, 21},
+       },
+       {
+               {21, 21},
+               {22, 21},
+       },
+       {
+               {21, 22},
+               {22, 21},
+       },
+       {
+               {21, 22},
+               {22, 21},
+       },
+       {
+               {21, 22},
+               {22, 22},
+       },
+       {
+               {21, 22},
+               {22, 22},
+       },
+       {
+               {22, 22},
+               {22, 22},
+       },
+       {
+               {22, 22},
+               {22, 22},
+       },
+       {
+               {22, 22},
+               {23, 22},
+       },
+       {
+               {22, 22},
+               {23, 22},
+       },
+       {
+               {22, 23},
+               {23, 22},
+       },
+       {
+               {22, 23},
+               {23, 22},
+       },
+       {
+               {22, 23},
+               {23, 23},
+       },
+       {
+               {22, 23},
+               {23, 23},
+       },
+       {
+               {23, 23},
+               {23, 23},
+       },
+       {
+               {23, 23},
+               {23, 23},
+       },
+       {
+               {23, 23},
+               {24, 23},
+       },
+       {
+               {23, 23},
+               {24, 23},
+       },
+       {
+               {23, 23},
+               {24, 23},
+       },
+       {
+               {23, 24},
+               {24, 23},
+       },
+       {
+               {23, 24},
+               {24, 23},
+       },
+       {
+               {23, 24},
+               {24, 24},
+       },
+       {
+               {23, 24},
+               {24, 24},
+       },
+       {
+               {24, 24},
+               {24, 24},
+       },
+       {
+               {24, 24},
+               {24, 24},
+       },
+       {
+               {24, 24},
+               {25, 24},
+       },
+       {
+               {24, 24},
+               {25, 24},
+       },
+       {
+               {24, 25},
+               {25, 24},
+       },
+       {
+               {24, 25},
+               {25, 24},
+       },
+       {
+               {24, 25},
+               {25, 25},
+       },
+       {
+               {24, 25},
+               {25, 25},
+       },
+       {
+               {25, 25},
+               {25, 25},
+       },
+       {
+               {25, 25},
+               {25, 25},
+       },
+       {
+               {25, 25},
+               {26, 25},
+       },
+       {
+               {25, 25},
+               {26, 25},
+       },
+       {
+               {25, 26},
+               {26, 25},
+       },
+       {
+               {25, 26},
+               {26, 25},
+       },
+       {
+               {25, 26},
+               {26, 26},
+       },
+       {
+               {25, 26},
+               {26, 26},
+       },
+       {
+               {26, 26},
+               {26, 26},
+       },
+       {
+               {26, 26},
+               {26, 26},
+       },
+       {
+               {26, 26},
+               {27, 26},
+       },
+       {
+               {26, 26},
+               {27, 26},
+       },
+       {
+               {26, 27},
+               {27, 26},
+       },
+       {
+               {26, 27},
+               {27, 26},
+       },
+       {
+               {26, 27},
+               {27, 27},
+       },
+       {
+               {26, 27},
+               {27, 27},
+       },
+       {
+               {27, 27},
+               {27, 27},
+       },
+       {
+               {27, 27},
+               {27, 27},
+       },
+       {
+               {27, 27},
+               {28, 27},
+       },
+       {
+               {27, 27},
+               {28, 27},
+       },
+       {
+               {27, 27},
+               {28, 27},
+       },
+       {
+               {27, 28},
+               {28, 27},
+       },
+       {
+               {27, 28},
+               {28, 27},
+       },
+       {
+               {27, 28},
+               {28, 28},
+       },
+       {
+               {27, 28},
+               {28, 28},
+       },
+       {
+               {28, 28},
+               {28, 28},
+       },
+       {
+               {28, 28},
+               {28, 28},
+       },
+       {
+               {28, 28},
+               {29, 28},
+       },
+       {
+               {28, 28},
+               {29, 28},
+       },
+       {
+               {28, 29},
+               {29, 28},
+       },
+       {
+               {28, 29},
+               {29, 28},
+       },
+       {
+               {28, 29},
+               {29, 29},
+       },
+       {
+               {28, 29},
+               {29, 29},
+       },
+       {
+               {29, 29},
+               {29, 29},
+       },
+       {
+               {29, 29},
+               {29, 29},
+       },
+       {
+               {29, 29},
+               {30, 29},
+       },
+       {
+               {29, 29},
+               {30, 29},
+       },
+       {
+               {29, 30},
+               {30, 29},
+       },
+       {
+               {29, 30},
+               {30, 29},
+       },
+       {
+               {29, 30},
+               {30, 30},
+       },
+       {
+               {29, 30},
+               {30, 30},
+       },
+       {
+               {30, 30},
+               {30, 30},
+       },
+       {
+               {30, 30},
+               {30, 30},
+       },
+       {
+               {30, 30},
+               {31, 30},
+       },
+       {
+               {30, 30},
+               {31, 30},
+       },
+       {
+               {30, 31},
+               {31, 30},
+       },
+       {
+               {30, 31},
+               {31, 30},
+       },
+       {
+               {30, 31},
+               {31, 31},
+       },
+       {
+               {30, 31},
+               {31, 31},
+       },
+       {
+               {31, 31},
+               {31, 31},
+       },
+       {
+               {31, 31},
+               {31, 31},
+       },
+};
+
+static const uint8_t dither_g2x2[256][2][2] =
+{
+       {
+               {0, 0},
+               {0, 0},
+       },
+       {
+               {0, 0},
+               {1, 0},
+       },
+       {
+               {0, 1},
+               {1, 0},
+       },
+       {
+               {0, 1},
+               {1, 1},
+       },
+       {
+               {1, 1},
+               {1, 1},
+       },
+       {
+               {1, 1},
+               {2, 1},
+       },
+       {
+               {1, 2},
+               {2, 1},
+       },
+       {
+               {1, 2},
+               {2, 2},
+       },
+       {
+               {2, 2},
+               {2, 2},
+       },
+       {
+               {2, 2},
+               {3, 2},
+       },
+       {
+               {2, 3},
+               {3, 2},
+       },
+       {
+               {2, 3},
+               {3, 3},
+       },
+       {
+               {3, 3},
+               {3, 3},
+       },
+       {
+               {3, 3},
+               {4, 3},
+       },
+       {
+               {3, 4},
+               {4, 3},
+       },
+       {
+               {3, 4},
+               {4, 4},
+       },
+       {
+               {4, 4},
+               {4, 4},
+       },
+       {
+               {4, 4},
+               {5, 4},
+       },
+       {
+               {4, 5},
+               {5, 4},
+       },
+       {
+               {4, 5},
+               {5, 5},
+       },
+       {
+               {5, 5},
+               {5, 5},
+       },
+       {
+               {5, 5},
+               {6, 5},
+       },
+       {
+               {5, 6},
+               {6, 5},
+       },
+       {
+               {5, 6},
+               {6, 6},
+       },
+       {
+               {6, 6},
+               {6, 6},
+       },
+       {
+               {6, 6},
+               {7, 6},
+       },
+       {
+               {6, 7},
+               {7, 6},
+       },
+       {
+               {6, 7},
+               {7, 7},
+       },
+       {
+               {7, 7},
+               {7, 7},
+       },
+       {
+               {7, 7},
+               {8, 7},
+       },
+       {
+               {7, 8},
+               {8, 7},
+       },
+       {
+               {7, 8},
+               {8, 8},
+       },
+       {
+               {8, 8},
+               {8, 8},
+       },
+       {
+               {8, 8},
+               {9, 8},
+       },
+       {
+               {8, 9},
+               {9, 8},
+       },
+       {
+               {8, 9},
+               {9, 9},
+       },
+       {
+               {9, 9},
+               {9, 9},
+       },
+       {
+               {9, 9},
+               {10, 9},
+       },
+       {
+               {9, 10},
+               {10, 9},
+       },
+       {
+               {9, 10},
+               {10, 10},
+       },
+       {
+               {10, 10},
+               {10, 10},
+       },
+       {
+               {10, 10},
+               {11, 10},
+       },
+       {
+               {10, 11},
+               {11, 10},
+       },
+       {
+               {10, 11},
+               {11, 11},
+       },
+       {
+               {11, 11},
+               {11, 11},
+       },
+       {
+               {11, 11},
+               {12, 11},
+       },
+       {
+               {11, 12},
+               {12, 11},
+       },
+       {
+               {11, 12},
+               {12, 12},
+       },
+       {
+               {11, 12},
+               {12, 12},
+       },
+       {
+               {12, 12},
+               {12, 12},
+       },
+       {
+               {12, 12},
+               {13, 12},
+       },
+       {
+               {12, 13},
+               {13, 12},
+       },
+       {
+               {12, 13},
+               {13, 13},
+       },
+       {
+               {13, 13},
+               {13, 13},
+       },
+       {
+               {13, 13},
+               {14, 13},
+       },
+       {
+               {13, 14},
+               {14, 13},
+       },
+       {
+               {13, 14},
+               {14, 14},
+       },
+       {
+               {14, 14},
+               {14, 14},
+       },
+       {
+               {14, 14},
+               {15, 14},
+       },
+       {
+               {14, 15},
+               {15, 14},
+       },
+       {
+               {14, 15},
+               {15, 15},
+       },
+       {
+               {15, 15},
+               {15, 15},
+       },
+       {
+               {15, 15},
+               {16, 15},
+       },
+       {
+               {15, 16},
+               {16, 15},
+       },
+       {
+               {15, 16},
+               {16, 16},
+       },
+       {
+               {16, 16},
+               {16, 16},
+       },
+       {
+               {16, 16},
+               {17, 16},
+       },
+       {
+               {16, 17},
+               {17, 16},
+       },
+       {
+               {16, 17},
+               {17, 17},
+       },
+       {
+               {17, 17},
+               {17, 17},
+       },
+       {
+               {17, 17},
+               {18, 17},
+       },
+       {
+               {17, 18},
+               {18, 17},
+       },
+       {
+               {17, 18},
+               {18, 18},
+       },
+       {
+               {18, 18},
+               {18, 18},
+       },
+       {
+               {18, 18},
+               {19, 18},
+       },
+       {
+               {18, 19},
+               {19, 18},
+       },
+       {
+               {18, 19},
+               {19, 19},
+       },
+       {
+               {19, 19},
+               {19, 19},
+       },
+       {
+               {19, 19},
+               {20, 19},
+       },
+       {
+               {19, 20},
+               {20, 19},
+       },
+       {
+               {19, 20},
+               {20, 20},
+       },
+       {
+               {20, 20},
+               {20, 20},
+       },
+       {
+               {20, 20},
+               {21, 20},
+       },
+       {
+               {20, 21},
+               {21, 20},
+       },
+       {
+               {20, 21},
+               {21, 21},
+       },
+       {
+               {21, 21},
+               {21, 21},
+       },
+       {
+               {21, 21},
+               {22, 21},
+       },
+       {
+               {21, 22},
+               {22, 21},
+       },
+       {
+               {21, 22},
+               {22, 22},
+       },
+       {
+               {22, 22},
+               {22, 22},
+       },
+       {
+               {22, 22},
+               {23, 22},
+       },
+       {
+               {22, 23},
+               {23, 22},
+       },
+       {
+               {22, 23},
+               {23, 23},
+       },
+       {
+               {23, 23},
+               {23, 23},
+       },
+       {
+               {23, 23},
+               {24, 23},
+       },
+       {
+               {23, 24},
+               {24, 23},
+       },
+       {
+               {23, 24},
+               {24, 24},
+       },
+       {
+               {24, 24},
+               {24, 24},
+       },
+       {
+               {24, 24},
+               {25, 24},
+       },
+       {
+               {24, 25},
+               {25, 24},
+       },
+       {
+               {24, 25},
+               {25, 25},
+       },
+       {
+               {25, 25},
+               {25, 25},
+       },
+       {
+               {25, 25},
+               {26, 25},
+       },
+       {
+               {25, 26},
+               {26, 25},
+       },
+       {
+               {25, 26},
+               {26, 26},
+       },
+       {
+               {26, 26},
+               {26, 26},
+       },
+       {
+               {26, 26},
+               {27, 26},
+       },
+       {
+               {26, 27},
+               {27, 26},
+       },
+       {
+               {26, 27},
+               {27, 27},
+       },
+       {
+               {27, 27},
+               {27, 27},
+       },
+       {
+               {27, 27},
+               {28, 27},
+       },
+       {
+               {27, 28},
+               {28, 27},
+       },
+       {
+               {27, 28},
+               {28, 28},
+       },
+       {
+               {28, 28},
+               {28, 28},
+       },
+       {
+               {28, 28},
+               {29, 28},
+       },
+       {
+               {28, 29},
+               {29, 28},
+       },
+       {
+               {28, 29},
+               {29, 29},
+       },
+       {
+               {29, 29},
+               {29, 29},
+       },
+       {
+               {29, 29},
+               {30, 29},
+       },
+       {
+               {29, 30},
+               {30, 29},
+       },
+       {
+               {29, 30},
+               {30, 30},
+       },
+       {
+               {30, 30},
+               {30, 30},
+       },
+       {
+               {30, 30},
+               {31, 30},
+       },
+       {
+               {30, 31},
+               {31, 30},
+       },
+       {
+               {30, 31},
+               {31, 31},
+       },
+       {
+               {31, 31},
+               {31, 31},
+       },
+       {
+               {31, 31},
+               {32, 31},
+       },
+       {
+               {31, 32},
+               {32, 31},
+       },
+       {
+               {31, 32},
+               {32, 32},
+       },
+       {
+               {32, 32},
+               {32, 32},
+       },
+       {
+               {32, 32},
+               {33, 32},
+       },
+       {
+               {32, 33},
+               {33, 32},
+       },
+       {
+               {32, 33},
+               {33, 33},
+       },
+       {
+               {33, 33},
+               {33, 33},
+       },
+       {
+               {33, 33},
+               {34, 33},
+       },
+       {
+               {33, 34},
+               {34, 33},
+       },
+       {
+               {33, 34},
+               {34, 34},
+       },
+       {
+               {34, 34},
+               {34, 34},
+       },
+       {
+               {34, 34},
+               {35, 34},
+       },
+       {
+               {34, 35},
+               {35, 34},
+       },
+       {
+               {34, 35},
+               {35, 35},
+       },
+       {
+               {35, 35},
+               {35, 35},
+       },
+       {
+               {35, 35},
+               {36, 35},
+       },
+       {
+               {35, 36},
+               {36, 35},
+       },
+       {
+               {35, 36},
+               {36, 35},
+       },
+       {
+               {35, 36},
+               {36, 36},
+       },
+       {
+               {36, 36},
+               {36, 36},
+       },
+       {
+               {36, 36},
+               {37, 36},
+       },
+       {
+               {36, 37},
+               {37, 36},
+       },
+       {
+               {36, 37},
+               {37, 37},
+       },
+       {
+               {37, 37},
+               {37, 37},
+       },
+       {
+               {37, 37},
+               {38, 37},
+       },
+       {
+               {37, 38},
+               {38, 37},
+       },
+       {
+               {37, 38},
+               {38, 38},
+       },
+       {
+               {38, 38},
+               {38, 38},
+       },
+       {
+               {38, 38},
+               {39, 38},
+       },
+       {
+               {38, 39},
+               {39, 38},
+       },
+       {
+               {38, 39},
+               {39, 39},
+       },
+       {
+               {39, 39},
+               {39, 39},
+       },
+       {
+               {39, 39},
+               {40, 39},
+       },
+       {
+               {39, 40},
+               {40, 39},
+       },
+       {
+               {39, 40},
+               {40, 40},
+       },
+       {
+               {40, 40},
+               {40, 40},
+       },
+       {
+               {40, 40},
+               {41, 40},
+       },
+       {
+               {40, 41},
+               {41, 40},
+       },
+       {
+               {40, 41},
+               {41, 41},
+       },
+       {
+               {41, 41},
+               {41, 41},
+       },
+       {
+               {41, 41},
+               {42, 41},
+       },
+       {
+               {41, 42},
+               {42, 41},
+       },
+       {
+               {41, 42},
+               {42, 42},
+       },
+       {
+               {42, 42},
+               {42, 42},
+       },
+       {
+               {42, 42},
+               {43, 42},
+       },
+       {
+               {42, 43},
+               {43, 42},
+       },
+       {
+               {42, 43},
+               {43, 43},
+       },
+       {
+               {43, 43},
+               {43, 43},
+       },
+       {
+               {43, 43},
+               {44, 43},
+       },
+       {
+               {43, 44},
+               {44, 43},
+       },
+       {
+               {43, 44},
+               {44, 44},
+       },
+       {
+               {44, 44},
+               {44, 44},
+       },
+       {
+               {44, 44},
+               {45, 44},
+       },
+       {
+               {44, 45},
+               {45, 44},
+       },
+       {
+               {44, 45},
+               {45, 45},
+       },
+       {
+               {45, 45},
+               {45, 45},
+       },
+       {
+               {45, 45},
+               {46, 45},
+       },
+       {
+               {45, 46},
+               {46, 45},
+       },
+       {
+               {45, 46},
+               {46, 46},
+       },
+       {
+               {46, 46},
+               {46, 46},
+       },
+       {
+               {46, 46},
+               {47, 46},
+       },
+       {
+               {46, 47},
+               {47, 46},
+       },
+       {
+               {46, 47},
+               {47, 47},
+       },
+       {
+               {47, 47},
+               {47, 47},
+       },
+       {
+               {47, 47},
+               {48, 47},
+       },
+       {
+               {47, 48},
+               {48, 47},
+       },
+       {
+               {47, 48},
+               {48, 48},
+       },
+       {
+               {48, 48},
+               {48, 48},
+       },
+       {
+               {48, 48},
+               {49, 48},
+       },
+       {
+               {48, 49},
+               {49, 48},
+       },
+       {
+               {48, 49},
+               {49, 49},
+       },
+       {
+               {49, 49},
+               {49, 49},
+       },
+       {
+               {49, 49},
+               {50, 49},
+       },
+       {
+               {49, 50},
+               {50, 49},
+       },
+       {
+               {49, 50},
+               {50, 50},
+       },
+       {
+               {50, 50},
+               {50, 50},
+       },
+       {
+               {50, 50},
+               {51, 50},
+       },
+       {
+               {50, 51},
+               {51, 50},
+       },
+       {
+               {50, 51},
+               {51, 51},
+       },
+       {
+               {51, 51},
+               {51, 51},
+       },
+       {
+               {51, 51},
+               {52, 51},
+       },
+       {
+               {51, 52},
+               {52, 51},
+       },
+       {
+               {51, 52},
+               {52, 52},
+       },
+       {
+               {52, 52},
+               {52, 52},
+       },
+       {
+               {52, 52},
+               {53, 52},
+       },
+       {
+               {52, 53},
+               {53, 52},
+       },
+       {
+               {52, 53},
+               {53, 53},
+       },
+       {
+               {53, 53},
+               {53, 53},
+       },
+       {
+               {53, 53},
+               {54, 53},
+       },
+       {
+               {53, 54},
+               {54, 53},
+       },
+       {
+               {53, 54},
+               {54, 54},
+       },
+       {
+               {54, 54},
+               {54, 54},
+       },
+       {
+               {54, 54},
+               {55, 54},
+       },
+       {
+               {54, 55},
+               {55, 54},
+       },
+       {
+               {54, 55},
+               {55, 55},
+       },
+       {
+               {55, 55},
+               {55, 55},
+       },
+       {
+               {55, 55},
+               {56, 55},
+       },
+       {
+               {55, 55},
+               {56, 55},
+       },
+       {
+               {55, 56},
+               {56, 55},
+       },
+       {
+               {55, 56},
+               {56, 56},
+       },
+       {
+               {56, 56},
+               {56, 56},
+       },
+       {
+               {56, 56},
+               {57, 56},
+       },
+       {
+               {56, 57},
+               {57, 56},
+       },
+       {
+               {56, 57},
+               {57, 57},
+       },
+       {
+               {57, 57},
+               {57, 57},
+       },
+       {
+               {57, 57},
+               {58, 57},
+       },
+       {
+               {57, 58},
+               {58, 57},
+       },
+       {
+               {57, 58},
+               {58, 58},
+       },
+       {
+               {58, 58},
+               {58, 58},
+       },
+       {
+               {58, 58},
+               {59, 58},
+       },
+       {
+               {58, 59},
+               {59, 58},
+       },
+       {
+               {58, 59},
+               {59, 59},
+       },
+       {
+               {59, 59},
+               {59, 59},
+       },
+       {
+               {59, 59},
+               {60, 59},
+       },
+       {
+               {59, 60},
+               {60, 59},
+       },
+       {
+               {59, 60},
+               {60, 60},
+       },
+       {
+               {60, 60},
+               {60, 60},
+       },
+       {
+               {60, 60},
+               {61, 60},
+       },
+       {
+               {60, 61},
+               {61, 60},
+       },
+       {
+               {60, 61},
+               {61, 61},
+       },
+       {
+               {61, 61},
+               {61, 61},
+       },
+       {
+               {61, 61},
+               {62, 61},
+       },
+       {
+               {61, 62},
+               {62, 61},
+       },
+       {
+               {61, 62},
+               {62, 62},
+       },
+       {
+               {62, 62},
+               {62, 62},
+       },
+       {
+               {62, 62},
+               {63, 62},
+       },
+       {
+               {62, 63},
+               {63, 62},
+       },
+       {
+               {62, 63},
+               {63, 63},
+       },
+       {
+               {63, 63},
+               {63, 63},
+       },
+};
+
diff --git a/pcem/vid_voodoo_fb.cpp b/pcem/vid_voodoo_fb.cpp
new file mode 100644 (file)
index 0000000..cbf8c17
--- /dev/null
@@ -0,0 +1,447 @@
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_dither.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_fb.h"
+
+uint16_t voodoo_fb_readw(uint32_t addr, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        int x, y;
+        uint32_t read_addr;
+        uint16_t temp;
+
+        if (voodoo->type >= VOODOO_BANSHEE)
+        {
+                x = addr & 0xffe;
+                y = (addr >> 12) & 0x3ff;
+        }
+        else
+        {
+                x = addr & 0x7fe;
+                y = (addr >> 11) & 0x3ff;
+        }
+
+        if (SLI_ENABLED)
+        {
+                voodoo_set_t *set = voodoo->set;
+
+                if (y & 1)
+                        voodoo = set->voodoos[1];
+                else
+                        voodoo = set->voodoos[0];
+
+                y >>= 1;
+        }
+
+        if (voodoo->col_tiled)
+                read_addr = voodoo->fb_read_offset + (x & 127) + (x >> 7) * 128*32 + (y & 31) * 128 + (y >> 5) * voodoo->row_width;
+        else
+                read_addr = voodoo->fb_read_offset + x + (y * voodoo->row_width);
+
+        if (read_addr > voodoo->fb_mask)
+                return 0xffff;
+
+        temp = *(uint16_t *)(&voodoo->fb_mem[read_addr & voodoo->fb_mask]);
+
+//        pclog("voodoo_fb_readw : %08X %08X  %i %i  %08X %08X  %08x:%08x %i\n", addr, temp, x, y, read_addr, *(uint32_t *)(&voodoo->fb_mem[4]), cs, pc, fb_reads++);
+        return temp;
+}
+uint32_t voodoo_fb_readl(uint32_t addr, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        int x, y;
+        uint32_t read_addr;
+        uint32_t temp;
+
+        if (voodoo->type >= VOODOO_BANSHEE)
+        {
+                x = addr & 0xffe;
+                y = (addr >> 12) & 0x3ff;
+        }
+        else
+        {
+                x = addr & 0x7fe;
+                y = (addr >> 11) & 0x3ff;
+        }
+
+        if (SLI_ENABLED)
+        {
+                voodoo_set_t *set = voodoo->set;
+
+                if (y & 1)
+                        voodoo = set->voodoos[1];
+                else
+                        voodoo = set->voodoos[0];
+
+                y >>= 1;
+        }
+
+        if (voodoo->col_tiled)
+                read_addr = voodoo->fb_read_offset + (x & 127) + (x >> 7) * 128*32 + (y & 31) * 128 + (y >> 5) * voodoo->row_width;
+        else
+                read_addr = voodoo->fb_read_offset + x + (y * voodoo->row_width);
+
+        if (read_addr > voodoo->fb_mask)
+                return 0xffffffff;
+
+        temp = *(uint32_t *)(&voodoo->fb_mem[read_addr & voodoo->fb_mask]);
+
+//        pclog("voodoo_fb_readl : %08X %08x %08X  x=%i y=%i  %08X %08X  %08x:%08x %i ro=%08x rw=%i\n", addr, read_addr, temp, x, y, read_addr, *(uint32_t *)(&voodoo->fb_mem[4]), cs, pc, fb_reads++, voodoo->fb_read_offset, voodoo->row_width);
+        return temp;
+}
+
+static inline uint16_t do_dither(voodoo_params_t *params, rgba8_t col, int x, int y)
+{
+        int r, g, b;
+
+        if (dither)
+        {
+                if (dither2x2)
+                {
+                        r = dither_rb2x2[col.r][y & 1][x & 1];
+                        g =  dither_g2x2[col.g][y & 1][x & 1];
+                        b = dither_rb2x2[col.b][y & 1][x & 1];
+                }
+                else
+                {
+                        r = dither_rb[col.r][y & 3][x & 3];
+                        g =  dither_g[col.g][y & 3][x & 3];
+                        b = dither_rb[col.b][y & 3][x & 3];
+                }
+        }
+        else
+        {
+                r = col.r >> 3;
+                g = col.g >> 2;
+                b = col.b >> 3;
+        }
+
+        return b | (g << 5) | (r << 11);
+}
+
+void voodoo_fb_writew(uint32_t addr, uint16_t val, void *p)
+{ /*16-bit linear frame buffer write: decode val per lfbMode, then store colour and/or depth at the location addr selects. p is the voodoo_t*/
+        voodoo_t *voodoo = (voodoo_t *)p;
+        voodoo_params_t *params = &voodoo->params;
+        int x, y; /*x is a byte offset within the row, y a row number*/
+        uint32_t write_addr, write_addr_aux; /*Locations in fb_mem for the colour write and for the depth (aux) write*/
+        rgba8_t colour_data;
+        uint16_t depth_data;
+        uint8_t alpha_data;
+        int write_mask = 0;
+
+        colour_data.r = colour_data.g = colour_data.b = colour_data.a = 0; /*Defined colour for formats that supply none (LFB_FORMAT_DEPTH)*/
+
+        depth_data = voodoo->params.zaColor & 0xffff; /*Default depth: low 16 bits of zaColor*/
+        alpha_data = voodoo->params.zaColor >> 24; /*Default alpha: top 8 bits of zaColor*/
+
+//        while (!RB_EMPTY)
+//                thread_reset_event(voodoo->not_full_event);
+
+//        pclog("voodoo_fb_writew : %08X %04X\n", addr, val);
+
+        /*Decode val according to the write format selected in lfbMode*/
+        switch (voodoo->lfbMode & LFB_FORMAT_MASK)
+        {
+                case LFB_FORMAT_RGB565:
+                colour_data = rgb565[val];
+                alpha_data = 0xff;
+                write_mask = LFB_WRITE_COLOUR;
+                break;
+                case LFB_FORMAT_RGB555:
+                colour_data = argb1555[val]; /*Same table as ARGB1555, but alpha is forced to 0xff below*/
+                alpha_data = 0xff;
+                write_mask = LFB_WRITE_COLOUR;
+                break;
+                case LFB_FORMAT_ARGB1555:
+                colour_data = argb1555[val];
+                alpha_data = colour_data.a; /*Alpha comes from the table entry*/
+                write_mask = LFB_WRITE_COLOUR;
+                break;
+                case LFB_FORMAT_DEPTH:
+                depth_data = val; /*val is a depth value; colour stays zero*/
+                write_mask = LFB_WRITE_DEPTH;
+                break;
+
+                default:
+                fatal("voodoo_fb_writew : bad LFB format %08X\n", voodoo->lfbMode);
+        }
+
+        if (voodoo->type >= VOODOO_BANSHEE) /*Types from VOODOO_BANSHEE upwards carry one more bit of x in the address*/
+        {
+                x = addr & 0xffe;
+                y = (addr >> 12) & 0x3ff;
+        }
+        else
+        {
+                x = addr & 0x7fe;
+                y = (addr >> 11) & 0x3ff;
+        }
+
+        if (SLI_ENABLED)
+        {
+                if ((!(voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && (y & 1)) || /*Bit clear: this card drops odd rows*/
+                    ((voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && !(y & 1))) /*Bit set: this card drops even rows*/
+                        return;
+                y >>= 1; /*The card stores only every other row*/
+        }
+
+        /*Writes into the buffer at front_offset flag row y in dirty_line - presumably so the display code redraws it; confirm against the users of dirty_line*/
+        if (voodoo->fb_write_offset == voodoo->params.front_offset && y < 2048)
+                voodoo->dirty_line[y] = 1;
+
+        if (voodoo->col_tiled) /*Tiled addressing uses tiles of 128 bytes by 32 rows*/
+                write_addr = voodoo->fb_write_offset + (x & 127) + (x >> 7) * 128*32 + (y & 31) * 128 + (y >> 5) * voodoo->row_width;
+        else
+                write_addr = voodoo->fb_write_offset + x + (y * voodoo->row_width);
+        if (voodoo->aux_tiled) /*Same mapping, relative to aux_offset*/
+                write_addr_aux = voodoo->params.aux_offset + (x & 127) + (x >> 7) * 128*32 + (y & 31) * 128 + (y >> 5) * voodoo->row_width;
+        else
+                write_addr_aux = voodoo->params.aux_offset + x + (y * voodoo->row_width);
+
+//        pclog("fb_writew %08x %i %i %i %08x\n", addr, x, y, voodoo->row_width, write_addr);
+
+        if (voodoo->lfbMode & 0x100) /*Bit 8 set: run the pixel through depth test, chroma key, fog, alpha test and alpha blend before storing*/
+        {
+                {
+                        rgba8_t write_data = colour_data;
+                        uint16_t new_depth = depth_data;
+
+                        if (params->fbzMode & FBZ_DEPTH_ENABLE)
+                        {
+                                uint16_t old_depth = *(uint16_t *)(&voodoo->fb_mem[write_addr_aux & voodoo->fb_mask]); /*Not referenced by name here - presumably read by DEPTH_TEST; confirm in the macro*/
+
+                                DEPTH_TEST(new_depth);
+                        }
+
+                        if ((params->fbzMode & FBZ_CHROMAKEY) && /*Pixels equal to the chroma key colour are not written*/
+                                write_data.r == params->chromaKey_r &&
+                                write_data.g == params->chromaKey_g &&
+                                write_data.b == params->chromaKey_b)
+                                goto skip_pixel;
+
+                        if (params->fogMode & FOG_ENABLE)
+                        {
+                                int32_t z = new_depth << 12;
+                                int64_t w_depth = (int64_t)(int32_t)new_depth;
+                                int32_t ia = alpha_data << 12;
+
+                                APPLY_FOG(write_data.r, write_data.g, write_data.b, z, ia, w_depth);
+                        }
+
+                        if (params->alphaMode & 1)
+                                ALPHA_TEST(alpha_data);
+
+                        if (params->alphaMode & (1 << 4))
+                        {
+                                uint16_t dat = *(uint16_t *)(&voodoo->fb_mem[write_addr & voodoo->fb_mask]); /*Pixel currently stored at the destination*/
+                                int dest_r, dest_g, dest_b, dest_a; /*Not referenced by name after being set - presumably read by ALPHA_BLEND; confirm in the macro*/
+
+                                dest_r = (dat >> 8) & 0xf8; /*Unpack 5:6:5 into the top bits of each 8-bit channel*/
+                                dest_g = (dat >> 3) & 0xfc;
+                                dest_b = (dat << 3) & 0xf8;
+                                dest_r |= (dest_r >> 5); /*Copy the top bits into the vacant low bits*/
+                                dest_g |= (dest_g >> 6);
+                                dest_b |= (dest_b >> 5);
+                                dest_a = 0xff;
+
+                                ALPHA_BLEND(write_data.r, write_data.g, write_data.b, alpha_data);
+                        }
+
+                        if (params->fbzMode & FBZ_RGB_WMASK)
+                                *(uint16_t *)(&voodoo->fb_mem[write_addr & voodoo->fb_mask]) = do_dither(&voodoo->params, write_data, x >> 1, y); /*x is in bytes at 2 bytes per pixel, so x >> 1 is the pixel column*/
+                        if (params->fbzMode & FBZ_DEPTH_WMASK)
+                                *(uint16_t *)(&voodoo->fb_mem[write_addr_aux & voodoo->fb_mask]) = new_depth;
+
+skip_pixel: /*Target of the chroma key goto above*/
+                        x = x; /*No-op so the label has a statement to attach to*/
+                }
+        }
+        else /*Bit 8 clear: store the decoded data directly, as selected by write_mask*/
+        {
+                if (write_mask & LFB_WRITE_COLOUR)
+                        *(uint16_t *)(&voodoo->fb_mem[write_addr & voodoo->fb_mask]) = do_dither(&voodoo->params, colour_data, x >> 1, y);
+                if (write_mask & LFB_WRITE_DEPTH)
+                        *(uint16_t *)(&voodoo->fb_mem[write_addr_aux & voodoo->fb_mask]) = depth_data;
+        }
+}
+
+
+void voodoo_fb_writel(uint32_t addr, uint32_t val, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        voodoo_params_t *params = &voodoo->params;
+        int x, y;
+        uint32_t write_addr, write_addr_aux;
+        rgba8_t colour_data[2];
+        uint16_t depth_data[2];
+        uint8_t alpha_data[2];
+        int write_mask = 0, count = 1;
+
+        depth_data[0] = depth_data[1] = voodoo->params.zaColor & 0xffff;
+        alpha_data[0] = alpha_data[1] = voodoo->params.zaColor >> 24;
+//        while (!RB_EMPTY)
+//                thread_reset_event(voodoo->not_full_event);
+
+//        pclog("voodoo_fb_writel : %08X %08X\n", addr, val);
+
+        switch (voodoo->lfbMode & LFB_FORMAT_MASK)
+        {
+                case LFB_FORMAT_RGB565:
+                colour_data[0] = rgb565[val & 0xffff];
+                colour_data[1] = rgb565[val >> 16];
+                write_mask = LFB_WRITE_COLOUR;
+                count = 2;
+                break;
+                case LFB_FORMAT_RGB555:
+                colour_data[0] = argb1555[val & 0xffff];
+                colour_data[1] = argb1555[val >> 16];
+                write_mask = LFB_WRITE_COLOUR;
+                count = 2;
+                break;
+                case LFB_FORMAT_ARGB1555:
+                colour_data[0] = argb1555[val & 0xffff];
+                alpha_data[0] = colour_data[0].a;
+                colour_data[1] = argb1555[val >> 16];
+                alpha_data[1] = colour_data[1].a;
+                write_mask = LFB_WRITE_COLOUR;
+                count = 2;
+                break;
+
+                case LFB_FORMAT_ARGB8888:
+                colour_data[0].b = val & 0xff;
+                colour_data[0].g = (val >> 8) & 0xff;
+                colour_data[0].r = (val >> 16) & 0xff;
+                alpha_data[0] = (val >> 24) & 0xff;
+                write_mask = LFB_WRITE_COLOUR;
+                addr >>= 1;
+                break;
+
+                case LFB_FORMAT_DEPTH:
+                depth_data[0] = val;
+                depth_data[1] = val >> 16;
+                write_mask = LFB_WRITE_DEPTH;
+                count = 2;
+                break;
+
+                default:
+                fatal("voodoo_fb_writel : bad LFB format %08X\n", voodoo->lfbMode);
+        }
+
+        if (voodoo->type >= VOODOO_BANSHEE)
+        {
+                x = addr & 0xffe;
+                y = (addr >> 12) & 0x3ff;
+        }
+        else
+        {
+                x = addr & 0x7fe;
+                y = (addr >> 11) & 0x3ff;
+        }
+
+        if (SLI_ENABLED)
+        {
+                if ((!(voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && (y & 1)) ||
+                    ((voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && !(y & 1)))
+                        return;
+                y >>= 1;
+        }
+
+        if (voodoo->fb_write_offset == voodoo->params.front_offset && y < 2048)
+                voodoo->dirty_line[y] = 1;
+
+        if (voodoo->col_tiled)
+                write_addr = voodoo->fb_write_offset + (x & 127) + (x >> 7) * 128*32 + (y & 31) * 128 + (y >> 5) * voodoo->row_width;
+        else
+                write_addr = voodoo->fb_write_offset + x + (y * voodoo->row_width);
+        if (voodoo->aux_tiled)
+                write_addr_aux = voodoo->params.aux_offset + (x & 127) + (x >> 7) * 128*32 + (y & 31) * 128 + (y >> 5) * voodoo->row_width;
+        else
+                write_addr_aux = voodoo->params.aux_offset + x + (y * voodoo->row_width);
+
+//        pclog("fb_writel %08x x=%i y=%i rw=%i %08x wo=%08x\n", addr, x, y, voodoo->row_width, write_addr, voodoo->fb_write_offset);
+
+        if (voodoo->lfbMode & 0x100)
+        {
+                int c;
+
+                for (c = 0; c < count; c++)
+                {
+                        rgba8_t write_data = colour_data[c];
+                        uint16_t new_depth = depth_data[c];
+
+                        if (params->fbzMode & FBZ_DEPTH_ENABLE)
+                        {
+                                uint16_t old_depth = *(uint16_t *)(&voodoo->fb_mem[write_addr_aux & voodoo->fb_mask]);
+
+                                DEPTH_TEST(new_depth);
+                        }
+
+                        if ((params->fbzMode & FBZ_CHROMAKEY) &&
+                                write_data.r == params->chromaKey_r &&
+                                write_data.g == params->chromaKey_g &&
+                                write_data.b == params->chromaKey_b)
+                                goto skip_pixel;
+
+                        if (params->fogMode & FOG_ENABLE)
+                        {
+                                int32_t z = new_depth << 12;
+                                int64_t w_depth = new_depth;
+                                int32_t ia = alpha_data[c] << 12;
+
+                                APPLY_FOG(write_data.r, write_data.g, write_data.b, z, ia, w_depth);
+                        }
+
+                        if (params->alphaMode & 1)
+                                ALPHA_TEST(alpha_data[c]);
+
+                        if (params->alphaMode & (1 << 4))
+                        {
+                                uint16_t dat = *(uint16_t *)(&voodoo->fb_mem[write_addr & voodoo->fb_mask]);
+                                int dest_r, dest_g, dest_b, dest_a;
+
+                                dest_r = (dat >> 8) & 0xf8;
+                                dest_g = (dat >> 3) & 0xfc;
+                                dest_b = (dat << 3) & 0xf8;
+                                dest_r |= (dest_r >> 5);
+                                dest_g |= (dest_g >> 6);
+                                dest_b |= (dest_b >> 5);
+                                dest_a = 0xff;
+
+                                ALPHA_BLEND(write_data.r, write_data.g, write_data.b, alpha_data[c]);
+                        }
+
+                        if (params->fbzMode & FBZ_RGB_WMASK)
+                                *(uint16_t *)(&voodoo->fb_mem[write_addr & voodoo->fb_mask]) = do_dither(&voodoo->params, write_data, (x >> 1) + c, y);
+                        if (params->fbzMode & FBZ_DEPTH_WMASK)
+                                *(uint16_t *)(&voodoo->fb_mem[write_addr_aux & voodoo->fb_mask]) = new_depth;
+
+skip_pixel:
+                        write_addr += 2;
+                        write_addr_aux += 2;
+                }
+        }
+        else
+        {
+                int c;
+
+                for (c = 0; c < count; c++)
+                {
+                        if (write_mask & LFB_WRITE_COLOUR)
+                                *(uint16_t *)(&voodoo->fb_mem[write_addr & voodoo->fb_mask]) = do_dither(&voodoo->params, colour_data[c], (x >> 1) + c, y);
+                        if (write_mask & LFB_WRITE_DEPTH)
+                                *(uint16_t *)(&voodoo->fb_mem[write_addr_aux & voodoo->fb_mask]) = depth_data[c];
+
+                        write_addr += 2;
+                        write_addr_aux += 2;
+                }
+        }
+}
diff --git a/pcem/vid_voodoo_fb.h b/pcem/vid_voodoo_fb.h
new file mode 100644 (file)
index 0000000..fcb2513
--- /dev/null
@@ -0,0 +1,4 @@
+uint16_t voodoo_fb_readw(uint32_t addr, void *p); /*Presumably a 16-bit framebuffer read - definition not visible here, confirm*/
+uint32_t voodoo_fb_readl(uint32_t addr, void *p); /*Presumably a 32-bit framebuffer read - definition not visible here, confirm*/
+void voodoo_fb_writew(uint32_t addr, uint16_t val, void *p); /*Called by the FIFO thread for FIFO_WRITEW_FB entries, with its voodoo_t passed as p*/
+void voodoo_fb_writel(uint32_t addr, uint32_t val, void *p); /*Called by the FIFO thread for FIFO_WRITEL_FB entries and CMDFIFO packet 5 (space 2), with its voodoo_t passed as p*/
diff --git a/pcem/vid_voodoo_fifo.cpp b/pcem/vid_voodoo_fifo.cpp
new file mode 100644 (file)
index 0000000..59861f3
--- /dev/null
@@ -0,0 +1,503 @@
+#include <math.h>
+#include <stddef.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_banshee_blitter.h"
+#include "vid_voodoo_fb.h"
+#include "vid_voodoo_fifo.h"
+#include "vid_voodoo_reg.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_texture.h"
+
+#define WAKE_DELAY (TIMER_USEC * 100)
+/*Request a deferred wake-up of the FIFO thread by arming wake_timer with
+  WAKE_DELAY. Does nothing when the timer is already enabled.*/
+void voodoo_wake_fifo_thread(voodoo_t *voodoo)
+{
+        if (timer_is_enabled(&voodoo->wake_timer))
+                return; /*A wake-up is already scheduled*/
+
+        /*Waking the FIFO thread straight away would most likely have it
+          handle a single word and fall asleep again, so it would need waking
+          on nearly every write. Holding off briefly lets the CPU emulation
+          queue more data, giving the thread a batch of work to process.*/
+        timer_set_delay_u64(&voodoo->wake_timer, WAKE_DELAY);
+}
+
+/*Wake the FIFO thread immediately by signalling the event it sleeps on
+  (no batching delay, unlike voodoo_wake_fifo_thread()).*/
+void voodoo_wake_fifo_thread_now(voodoo_t *voodoo)
+{
+        thread_set_event(voodoo->wake_fifo_thread); /*Wake up FIFO thread if moving from idle*/
+}
+
+/*Callback taking an opaque pointer; p must point to a voodoo_t.
+  Performs the same immediate wake-up as voodoo_wake_fifo_thread_now().*/
+void voodoo_wake_timer(void *p)
+{
+        voodoo_wake_fifo_thread_now((voodoo_t *)p);
+}
+
+void voodoo_queue_command(voodoo_t *voodoo, uint32_t addr_type, uint32_t val)
+{
+        fifo_entry_t *fifo = &voodoo->fifo[voodoo->fifo_write_idx & FIFO_MASK];
+
+        while (FIFO_FULL)
+        {
+                thread_reset_event(voodoo->fifo_not_full_event);
+                if (FIFO_FULL)
+                {
+                        thread_wait_event(voodoo->fifo_not_full_event, 1); /*Wait for room in ringbuffer*/
+                        if (FIFO_FULL)
+                                voodoo_wake_fifo_thread_now(voodoo);
+                }
+        }
+
+        fifo->val = val;
+        fifo->addr_type = addr_type;
+
+        voodoo->fifo_write_idx++;
+
+        if (FIFO_ENTRIES > 0xe000)
+                voodoo_wake_fifo_thread(voodoo);
+}
+
+/*Block until the FIFO ring buffer is empty and the render thread is idle.
+  voodoo->flush is held at 1 for the duration of the wait.*/
+void voodoo_flush(voodoo_t *voodoo)
+{
+        voodoo->flush = 1;
+
+        for (;;)
+        {
+                if (FIFO_EMPTY)
+                        break;
+
+                /*Keep prodding the FIFO thread, then wait on the event it
+                  signals (timeout argument 1) before checking again*/
+                voodoo_wake_fifo_thread_now(voodoo);
+                thread_wait_event(voodoo->fifo_not_full_event, 1);
+        }
+
+        voodoo_wait_for_render_thread_idle(voodoo);
+
+        voodoo->flush = 0;
+}
+
+/*Schedule a wake-up for voodoo's FIFO thread and, when SLI is enabled, the
+  card is not a VOODOO_2 and voodoo is the first card of the set, also for
+  the second card's FIFO thread.*/
+void voodoo_wake_fifo_threads(voodoo_set_t *set, voodoo_t *voodoo)
+{
+        voodoo_wake_fifo_thread(voodoo);
+
+        /*The checks below run in the same order as a short-circuit && chain*/
+        if (!SLI_ENABLED)
+                return;
+        if (voodoo->type == VOODOO_2)
+                return;
+        if (set->voodoos[0] != voodoo)
+                return;
+
+        voodoo_wake_fifo_thread(set->voodoos[1]);
+}
+
+void voodoo_wait_for_swap_complete(voodoo_t *voodoo)
+{
+        while (voodoo->swap_pending)
+        {
+                thread_wait_event(voodoo->wake_fifo_thread, -1);
+                thread_reset_event(voodoo->wake_fifo_thread);
+
+                thread_lock_mutex(voodoo->swap_mutex);
+                if ((voodoo->swap_pending && voodoo->flush) || FIFO_FULL)
+                {
+                        /*Main thread is waiting for FIFO to empty, so skip vsync wait and just swap*/
+                        memset(voodoo->dirty_line, 1, sizeof(voodoo->dirty_line));
+                        voodoo->front_offset = voodoo->params.front_offset;
+                        if (voodoo->swap_count > 0)
+                                voodoo->swap_count--;
+                        voodoo->swap_pending = 0;
+                        thread_unlock_mutex(voodoo->swap_mutex);
+                        break;
+                }
+                else
+                        thread_unlock_mutex(voodoo->swap_mutex);
+        }
+}
+
+
+static uint32_t cmdfifo_get(voodoo_t *voodoo)
+{
+        uint32_t val;
+
+        if (!voodoo->cmdfifo_in_sub)
+        {
+                while (voodoo->cmdfifo_depth_rd == voodoo->cmdfifo_depth_wr)
+                {
+                        thread_wait_event(voodoo->wake_fifo_thread, -1);
+                        thread_reset_event(voodoo->wake_fifo_thread);
+                }
+        }
+
+        val = *(uint32_t *)&voodoo->fb_mem[voodoo->cmdfifo_rp & voodoo->fb_mask];
+
+        if (!voodoo->cmdfifo_in_sub)
+                voodoo->cmdfifo_depth_rd++;
+        voodoo->cmdfifo_rp += 4;
+
+//        pclog("  CMDFIFO get %08x\n", val);
+        return val;
+}
+
+/*Fetch the next command FIFO word and return its bit pattern reinterpreted
+  as a float (no numeric conversion takes place).*/
+static inline float cmdfifo_get_f(voodoo_t *voodoo)
+{
+        union
+        {
+                uint32_t raw;
+                float as_float;
+        } pun;
+
+        pun.raw = cmdfifo_get(voodoo);
+
+        return pun.as_float;
+}
+
+enum /*Bit flags tested against a CMDFIFO packet type 3 header by voodoo_fifo_thread()*/
+{
+        CMDFIFO3_PC_MASK_RGB   = (1 << 10), /*Vertex carries colour: one packed word if CMDFIFO3_PC, else float R, G, B*/
+        CMDFIFO3_PC_MASK_ALPHA = (1 << 11), /*Vertex carries a float alpha (only read when CMDFIFO3_PC is clear)*/
+        CMDFIFO3_PC_MASK_Z     = (1 << 12), /*Vertex carries sVz*/
+        CMDFIFO3_PC_MASK_Wb    = (1 << 13), /*Vertex carries sWb*/
+        CMDFIFO3_PC_MASK_W0    = (1 << 14), /*Vertex carries sW0*/
+        CMDFIFO3_PC_MASK_S0_T0 = (1 << 15), /*Vertex carries sS0 followed by sT0*/
+        CMDFIFO3_PC_MASK_W1    = (1 << 16), /*Vertex carries sW1*/
+        CMDFIFO3_PC_MASK_S1_T1 = (1 << 17), /*Vertex carries sS1 followed by sT1*/
+
+        CMDFIFO3_PC = (1 << 28) /*Colour is packed in one word: blue bits 0-7, green 8-15, red 16-23, alpha 24-31*/
+};
+
+void voodoo_fifo_thread(void *param)
+{
+        voodoo_t *voodoo = (voodoo_t *)param;
+
+        while (1)
+        {
+                thread_set_event(voodoo->fifo_not_full_event);
+                thread_wait_event(voodoo->wake_fifo_thread, -1);
+                thread_reset_event(voodoo->wake_fifo_thread);
+                voodoo->voodoo_busy = 1;
+                while (!FIFO_EMPTY)
+                {
+                        uint64_t start_time = timer_read();
+                        uint64_t end_time;
+                        fifo_entry_t *fifo = &voodoo->fifo[voodoo->fifo_read_idx & FIFO_MASK];
+
+                        switch (fifo->addr_type & FIFO_TYPE)
+                        {
+                                case FIFO_WRITEL_REG:
+                                while ((fifo->addr_type & FIFO_TYPE) == FIFO_WRITEL_REG)
+                                {
+                                        voodoo_reg_writel(fifo->addr_type & FIFO_ADDR, fifo->val, voodoo);
+                                        fifo->addr_type = FIFO_INVALID;
+                                        voodoo->fifo_read_idx++;
+                                        if (FIFO_EMPTY)
+                                                break;
+                                        fifo = &voodoo->fifo[voodoo->fifo_read_idx & FIFO_MASK];
+                                }
+                                break;
+                                case FIFO_WRITEW_FB:
+                                voodoo_wait_for_render_thread_idle(voodoo);
+                                while ((fifo->addr_type & FIFO_TYPE) == FIFO_WRITEW_FB)
+                                {
+                                        voodoo_fb_writew(fifo->addr_type & FIFO_ADDR, fifo->val, voodoo);
+                                        fifo->addr_type = FIFO_INVALID;
+                                        voodoo->fifo_read_idx++;
+                                        if (FIFO_EMPTY)
+                                                break;
+                                        fifo = &voodoo->fifo[voodoo->fifo_read_idx & FIFO_MASK];
+                                }
+                                break;
+                                case FIFO_WRITEL_FB:
+                                voodoo_wait_for_render_thread_idle(voodoo);
+                                while ((fifo->addr_type & FIFO_TYPE) == FIFO_WRITEL_FB)
+                                {
+                                        voodoo_fb_writel(fifo->addr_type & FIFO_ADDR, fifo->val, voodoo);
+                                        fifo->addr_type = FIFO_INVALID;
+                                        voodoo->fifo_read_idx++;
+                                        if (FIFO_EMPTY)
+                                                break;
+                                        fifo = &voodoo->fifo[voodoo->fifo_read_idx & FIFO_MASK];
+                                }
+                                break;
+                                case FIFO_WRITEL_TEX:
+                                while ((fifo->addr_type & FIFO_TYPE) == FIFO_WRITEL_TEX)
+                                {
+                                        if (!(fifo->addr_type & 0x400000))
+                                                voodoo_tex_writel(fifo->addr_type & FIFO_ADDR, fifo->val, voodoo);
+                                        fifo->addr_type = FIFO_INVALID;
+                                        voodoo->fifo_read_idx++;
+                                        if (FIFO_EMPTY)
+                                                break;
+                                        fifo = &voodoo->fifo[voodoo->fifo_read_idx & FIFO_MASK];
+                                }
+                                break;
+                                case FIFO_WRITEL_2DREG:
+                                while ((fifo->addr_type & FIFO_TYPE) == FIFO_WRITEL_2DREG)
+                                {
+                                        voodoo_2d_reg_writel(voodoo, fifo->addr_type & FIFO_ADDR, fifo->val);
+                                        fifo->addr_type = FIFO_INVALID;
+                                        voodoo->fifo_read_idx++;
+                                        if (FIFO_EMPTY)
+                                                break;
+                                        fifo = &voodoo->fifo[voodoo->fifo_read_idx & FIFO_MASK];
+                                }
+                                break;
+
+                                default:
+                                fatal("Unknown fifo entry %08x\n", fifo->addr_type);
+                        }
+
+                        if (FIFO_ENTRIES > 0xe000)
+                                thread_set_event(voodoo->fifo_not_full_event);
+
+                        end_time = timer_read();
+                        voodoo->time += end_time - start_time;
+                }
+
+                while (voodoo->cmdfifo_enabled && (voodoo->cmdfifo_depth_rd != voodoo->cmdfifo_depth_wr || voodoo->cmdfifo_in_sub))
+                {
+                        uint64_t start_time = timer_read();
+                        uint64_t end_time;
+                        uint32_t header = cmdfifo_get(voodoo);
+                        uint32_t addr;
+                        uint32_t mask;
+                        int smode;
+                        int num;
+                        int num_verticies;
+                        int v_num;
+
+//                        pclog(" CMDFIFO header %08x at %08x\n", header, voodoo->cmdfifo_rp);
+
+                        switch (header & 7)
+                        {
+                                case 0:
+//                                pclog("CMDFIFO0\n");
+                                switch ((header >> 3) & 7)
+                                {
+                                        case 0: /*NOP*/
+                                        break;
+
+                                        case 1: /*JSR*/
+//                                        pclog("JSR %08x\n", (header >> 4) & 0xfffffc);
+                                        voodoo->cmdfifo_ret_addr = voodoo->cmdfifo_rp;
+                                        voodoo->cmdfifo_rp = (header >> 4) & 0xfffffc;
+                                        voodoo->cmdfifo_in_sub = 1;
+                                        break;
+
+                                        case 2: /*RET*/
+                                        voodoo->cmdfifo_rp = voodoo->cmdfifo_ret_addr;
+                                        voodoo->cmdfifo_in_sub = 0;
+                                        break;
+
+                                        case 3: /*JMP local frame buffer*/
+                                        voodoo->cmdfifo_rp = (header >> 4) & 0xfffffc;
+//                                        pclog("JMP to %08x %04x\n", voodoo->cmdfifo_rp, header);
+                                        break;
+
+                                        default:
+                                        fatal("Bad CMDFIFO0 %08x\n", header);
+                                }
+                                break;
+
+                                case 1:
+                                num = header >> 16;
+                                addr = (header & 0x7ff8) >> 1;
+//                                pclog("CMDFIFO1 addr=%08x\n",addr);
+                                while (num--)
+                                {
+                                        uint32_t val = cmdfifo_get(voodoo);
+                                        if ((addr & (1 << 13)) && voodoo->type >= VOODOO_BANSHEE)
+                                        {
+//                                                if (voodoo->type != VOODOO_BANSHEE)
+//                                                        fatal("CMDFIFO1: Not Banshee\n");
+//                                                pclog("CMDFIFO1: write %08x %08x\n", addr, val);
+                                                voodoo_2d_reg_writel(voodoo, addr, val);
+                                        }
+                                        else
+                                        {
+                                                if ((addr & 0x3ff) == SST_triangleCMD || (addr & 0x3ff) == SST_ftriangleCMD ||
+                                                    (addr & 0x3ff) == SST_fastfillCMD || (addr & 0x3ff) == SST_nopCMD)
+                                                        voodoo->cmd_written_fifo++;
+
+                                                if (voodoo->type >= VOODOO_BANSHEE && (addr & 0x3ff) == SST_swapbufferCMD)
+                                                        voodoo->cmd_written_fifo++;
+                                                voodoo_reg_writel(addr, val, voodoo);
+                                        }
+
+                                        if (header & (1 << 15))
+                                                addr += 4;
+                                }
+                                break;
+
+                                case 2:
+                                if (voodoo->type < VOODOO_BANSHEE)
+                                        fatal("CMDFIFO2: Not Banshee\n");
+                                mask = (header >> 3);
+                                addr = 8;
+                                while (mask)
+                                {
+                                        if (mask & 1)
+                                        {
+                                                uint32_t val = cmdfifo_get(voodoo);
+
+                                                voodoo_2d_reg_writel(voodoo, addr, val);
+                                        }
+
+                                        addr += 4;
+                                        mask >>= 1;
+                                }
+                                break;
+                                
+                                case 3:
+                                num = (header >> 29) & 7;
+                                mask = header;//(header >> 10) & 0xff;
+                                smode = (header >> 22) & 0xf;
+                                voodoo_reg_writel(SST_sSetupMode, ((header >> 10) & 0xff) | (smode << 16), voodoo);
+                                num_verticies = (header >> 6) & 0xf;
+                                v_num = 0;
+                                if (((header >> 3) & 7) == 2)
+                                        v_num = 1;
+//                                pclog("CMDFIFO3: num=%i verts=%i mask=%02x\n", num, num_verticies, (header >> 10) & 0xff);
+//                                pclog("CMDFIFO3 %02x %i\n", (header >> 10), (header >> 3) & 7);
+
+                                while (num_verticies--)
+                                {
+                                        voodoo->verts[3].sVx = cmdfifo_get_f(voodoo);
+                                        voodoo->verts[3].sVy = cmdfifo_get_f(voodoo);
+                                        if (mask & CMDFIFO3_PC_MASK_RGB)
+                                        {
+                                                if (header & CMDFIFO3_PC)
+                                                {
+                                                        uint32_t val = cmdfifo_get(voodoo);
+                                                        voodoo->verts[3].sBlue  = (float)(val & 0xff);
+                                                        voodoo->verts[3].sGreen = (float)((val >> 8) & 0xff);
+                                                        voodoo->verts[3].sRed   = (float)((val >> 16) & 0xff);
+                                                        voodoo->verts[3].sAlpha = (float)((val >> 24) & 0xff);
+                                                }
+                                                else
+                                                {
+                                                        voodoo->verts[3].sRed = cmdfifo_get_f(voodoo);
+                                                        voodoo->verts[3].sGreen = cmdfifo_get_f(voodoo);
+                                                        voodoo->verts[3].sBlue = cmdfifo_get_f(voodoo);
+                                                }
+                                        }
+                                        if ((mask & CMDFIFO3_PC_MASK_ALPHA) && !(header & CMDFIFO3_PC))
+                                                voodoo->verts[3].sAlpha = cmdfifo_get_f(voodoo);
+                                        if (mask & CMDFIFO3_PC_MASK_Z)
+                                                voodoo->verts[3].sVz = cmdfifo_get_f(voodoo);
+                                        if (mask & CMDFIFO3_PC_MASK_Wb)
+                                                voodoo->verts[3].sWb = cmdfifo_get_f(voodoo);
+                                        if (mask & CMDFIFO3_PC_MASK_W0)
+                                                voodoo->verts[3].sW0 = cmdfifo_get_f(voodoo);
+                                        if (mask & CMDFIFO3_PC_MASK_S0_T0)
+                                        {
+                                                voodoo->verts[3].sS0 = cmdfifo_get_f(voodoo);
+                                                voodoo->verts[3].sT0 = cmdfifo_get_f(voodoo);
+                                        }
+                                        if (mask & CMDFIFO3_PC_MASK_W1)
+                                                voodoo->verts[3].sW1 = cmdfifo_get_f(voodoo);
+                                        if (mask & CMDFIFO3_PC_MASK_S1_T1)
+                                        {
+                                                voodoo->verts[3].sS1 = cmdfifo_get_f(voodoo);
+                                                voodoo->verts[3].sT1 = cmdfifo_get_f(voodoo);
+                                        }
+                                        if (v_num)
+                                                voodoo_reg_writel(SST_sDrawTriCMD, 0, voodoo);
+                                        else
+                                                voodoo_reg_writel(SST_sBeginTriCMD, 0, voodoo);
+                                        v_num++;
+                                        if (v_num == 3 && ((header >> 3) & 7) == 0)
+                                                v_num = 0;
+                                }
+                                break;
+
+                                case 4:
+                                num = (header >> 29) & 7;
+                                mask = (header >> 15) & 0x3fff;
+                                addr = (header & 0x7ff8) >> 1;
+//                                pclog("CMDFIFO4 addr=%08x\n",addr);
+                                while (mask)
+                                {
+                                        if (mask & 1)
+                                        {
+                                                uint32_t val = cmdfifo_get(voodoo);
+
+                                                if ((addr & (1 << 13)) && voodoo->type >= VOODOO_BANSHEE)
+                                                {
+                                                        if (voodoo->type < VOODOO_BANSHEE)
+                                                                fatal("CMDFIFO1: Not Banshee\n");
+//                                                pclog("CMDFIFO1: write %08x %08x\n", addr, val);
+                                                        voodoo_2d_reg_writel(voodoo, addr, val);
+                                                }
+                                                else
+                                                {
+                                                        if ((addr & 0x3ff) == SST_triangleCMD || (addr & 0x3ff) == SST_ftriangleCMD ||
+                                                            (addr & 0x3ff) == SST_fastfillCMD || (addr & 0x3ff) == SST_nopCMD)
+                                                                voodoo->cmd_written_fifo++;
+
+                                                        if (voodoo->type >= VOODOO_BANSHEE && (addr & 0x3ff) == SST_swapbufferCMD)
+                                                                voodoo->cmd_written_fifo++;
+                                                        voodoo_reg_writel(addr, val, voodoo);
+                                                }
+                                        }
+
+                                        addr += 4;
+                                        mask >>= 1;
+                                }
+                                while (num--)
+                                        cmdfifo_get(voodoo);
+                                break;
+
+                                case 5:
+//                                if (header & 0x3fc00000)
+//                                        fatal("CMDFIFO packet 5 has byte disables set %08x\n", header);
+                                num = (header >> 3) & 0x7ffff;
+                                addr = cmdfifo_get(voodoo) & 0xffffff;
+                                if (!num)
+                                        num = 1;
+//                                pclog("CMDFIFO5 addr=%08x num=%i\n", addr, num);
+                                switch (header >> 30)
+                                {
+                                        case 0: /*Linear framebuffer (Banshee)*/
+                                        if (voodoo->texture_present[0][(addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT])
+                                        {
+//                                                pclog("texture_present at %08x %i\n", addr, (addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT);
+                                                flush_texture_cache(voodoo, addr & voodoo->texture_mask, 0);
+                                        }
+                                        if (voodoo->texture_present[1][(addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT])
+                                        {
+//                                                pclog("texture_present at %08x %i\n", addr, (addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT);
+                                                flush_texture_cache(voodoo, addr & voodoo->texture_mask, 1);
+                                        }
+                                        while (num--)
+                                        {
+                                                uint32_t val = cmdfifo_get(voodoo);
+                                                if (addr <= voodoo->fb_mask)
+                                                        *(uint32_t *)&voodoo->fb_mem[addr] = val;
+                                                addr += 4;
+                                        }
+                                        break;
+                                        case 2: /*Framebuffer*/
+                                        while (num--)
+                                        {
+                                                uint32_t val = cmdfifo_get(voodoo);
+                                                voodoo_fb_writel(addr, val, voodoo);
+                                                addr += 4;
+                                        }
+                                        break;
+                                        case 3: /*Texture*/
+                                        while (num--)
+                                        {
+                                                uint32_t val = cmdfifo_get(voodoo);
+                                                voodoo_tex_writel(addr, val, voodoo);
+                                                addr += 4;
+                                        }
+                                        break;
+
+                                        default:
+                                        fatal("CMDFIFO packet 5 bad space %08x %08x\n", header, voodoo->cmdfifo_rp);
+                                }
+                                break;
+
+                                default:
+                                fatal("Bad CMDFIFO packet %08x %08x\n", header, voodoo->cmdfifo_rp);
+                        }
+
+                        end_time = timer_read();
+                        voodoo->time += end_time - start_time;
+                }
+                voodoo->voodoo_busy = 0;
+        }
+}
diff --git a/pcem/vid_voodoo_fifo.h b/pcem/vid_voodoo_fifo.h
new file mode 100644 (file)
index 0000000..c1caeba
--- /dev/null
@@ -0,0 +1,8 @@
+void voodoo_wake_fifo_thread(voodoo_t *voodoo); /*Arm wake_timer (WAKE_DELAY) for a deferred wake-up unless it is already enabled*/
+void voodoo_wake_fifo_thread_now(voodoo_t *voodoo); /*Signal the wake_fifo_thread event immediately*/
+void voodoo_wake_timer(void *p); /*p is a voodoo_t *; signals its wake_fifo_thread event (presumably the wake_timer callback - confirm)*/
+void voodoo_queue_command(voodoo_t *voodoo, uint32_t addr_type, uint32_t val); /*Append an entry to the FIFO ring buffer, blocking while it is full*/
+void voodoo_flush(voodoo_t *voodoo); /*Block until the FIFO is empty and the render thread is idle*/
+void voodoo_wake_fifo_threads(voodoo_set_t *set, voodoo_t *voodoo); /*Deferred wake for voodoo and, under the SLI condition, for set->voodoos[1]*/
+void voodoo_wait_for_swap_complete(voodoo_t *voodoo); /*Wait while swap_pending is set; performs the swap itself if flushing or the FIFO is full*/
+void voodoo_fifo_thread(void *param); /*Thread entry point; param is a voodoo_t *; loops forever*/
diff --git a/pcem/vid_voodoo_reg.cpp b/pcem/vid_voodoo_reg.cpp
new file mode 100644 (file)
index 0000000..63e590b
--- /dev/null
@@ -0,0 +1,1321 @@
+#include <math.h>
+#include <stddef.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_banshee.h"
+#include "vid_voodoo_blitter.h"
+#include "vid_voodoo_dither.h"
+#include "vid_voodoo_fifo.h"
+#include "vid_voodoo_reg.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_setup.h"
+#include "vid_voodoo_texture.h"
+
+enum /*Chip-select bits tested against the 'chip' field ((addr >> 10) & 0xf) decoded in voodoo_reg_writel(); a field of 0 is replaced by 0xf, i.e. every bit set*/
+{
+        CHIP_FBI = 0x1, /*Gates writes to the non-TMU copies of shared registers, e.g. params.startW / dWdX / dWdY*/
+        CHIP_TREX0 = 0x2, /*Gates writes to params.tmu[0]*/
+        CHIP_TREX1 = 0x4, /*Gates writes to params.tmu[1]*/
+        CHIP_TREX2 = 0x8 /*NOTE(review): no params.tmu[2] write is gated on this bit in the handlers reviewed - confirm whether it is used at all*/
+};
+
+void voodoo_reg_writel(uint32_t addr, uint32_t val, void *p)
+{
+        voodoo_t *voodoo = (voodoo_t *)p;
+        union
+        {
+                uint32_t i;
+                float f;
+        } tempif;
+        int ad21 = addr & (1 << 21);
+        int chip = (addr >> 10) & 0xf;
+        if (!chip)
+                chip = 0xf;
+
+        tempif.i = val;
+//pclog("voodoo_reg_write_l: addr=%08x val=%08x(%f) chip=%x\n", addr, val, tempif.f, chip);
+        addr &= 0x3fc;
+
+        if ((voodoo->fbiInit3 & FBIINIT3_REMAP) && addr < 0x100 && ad21)
+                addr |= 0x400;
+        switch (addr)
+        {
+                case SST_swapbufferCMD:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+//                        pclog("swapbufferCMD %08x %08x\n", val, voodoo->leftOverlayBuf);
+
+                        voodoo_wait_for_render_thread_idle(voodoo);
+                        if (!(val & 1))
+                        {
+                                banshee_set_overlay_addr(voodoo->p, voodoo->leftOverlayBuf);
+                                thread_lock_mutex(voodoo->swap_mutex);
+                                if (voodoo->swap_count > 0)
+                                        voodoo->swap_count--;
+                                thread_unlock_mutex(voodoo->swap_mutex);
+                                voodoo->frame_count++;
+                        }
+                        else if (TRIPLE_BUFFER)
+                        {
+                                if (voodoo->swap_pending)
+                                        voodoo_wait_for_swap_complete(voodoo);
+                                voodoo->swap_interval = (val >> 1) & 0xff;
+                                voodoo->swap_offset = voodoo->leftOverlayBuf;
+                                voodoo->swap_pending = 1;
+                        }
+                        else
+                        {
+                                voodoo->swap_interval = (val >> 1) & 0xff;
+                                voodoo->swap_offset = voodoo->leftOverlayBuf;
+                                voodoo->swap_pending = 1;
+
+                                voodoo_wait_for_swap_complete(voodoo);
+                        }
+
+                        voodoo->cmd_read++;
+                        break;
+                }
+
+                if (TRIPLE_BUFFER)
+                {
+                        voodoo->disp_buffer = (voodoo->disp_buffer + 1) % 3;
+                        voodoo->draw_buffer = (voodoo->draw_buffer + 1) % 3;
+                }
+                else
+                {
+                        voodoo->disp_buffer = !voodoo->disp_buffer;
+                        voodoo->draw_buffer = !voodoo->draw_buffer;
+                }
+                voodoo_recalc(voodoo);
+
+                voodoo->params.swapbufferCMD = val;
+
+//                pclog("Swap buffer %08x %d %p %i\n", val, voodoo->swap_count, &voodoo->swap_count, (voodoo == voodoo->set->voodoos[1]) ? 1 : 0);
+//                voodoo->front_offset = params->front_offset;
+                voodoo_wait_for_render_thread_idle(voodoo);
+                if (!(val & 1))
+                {
+                        memset(voodoo->dirty_line, 1, sizeof(voodoo->dirty_line));
+                        voodoo->front_offset = voodoo->params.front_offset;
+                        thread_lock_mutex(voodoo->swap_mutex);
+                        if (voodoo->swap_count > 0)
+                                voodoo->swap_count--;
+                        thread_unlock_mutex(voodoo->swap_mutex);
+                }
+                else if (TRIPLE_BUFFER)
+                {
+                        if (voodoo->swap_pending)
+                                voodoo_wait_for_swap_complete(voodoo);
+
+                        voodoo->swap_interval = (val >> 1) & 0xff;
+                        voodoo->swap_offset = voodoo->params.front_offset;
+                        voodoo->swap_pending = 1;
+                }
+                else
+                {
+                        voodoo->swap_interval = (val >> 1) & 0xff;
+                        voodoo->swap_offset = voodoo->params.front_offset;
+                        voodoo->swap_pending = 1;
+
+                        voodoo_wait_for_swap_complete(voodoo);
+                }
+                voodoo->cmd_read++;
+                break;
+
+                case SST_vertexAx: case SST_remap_vertexAx:
+                voodoo->params.vertexAx = val & 0xffff;
+                break;
+                case SST_vertexAy: case SST_remap_vertexAy:
+                voodoo->params.vertexAy = val & 0xffff;
+                break;
+                case SST_vertexBx: case SST_remap_vertexBx:
+                voodoo->params.vertexBx = val & 0xffff;
+                break;
+                case SST_vertexBy: case SST_remap_vertexBy:
+                voodoo->params.vertexBy = val & 0xffff;
+                break;
+                case SST_vertexCx: case SST_remap_vertexCx:
+                voodoo->params.vertexCx = val & 0xffff;
+                break;
+                case SST_vertexCy: case SST_remap_vertexCy:
+                voodoo->params.vertexCy = val & 0xffff;
+                break;
+
+                case SST_startR: case SST_remap_startR:
+                voodoo->params.startR = val & 0xffffff;
+                break;
+                case SST_startG: case SST_remap_startG:
+                voodoo->params.startG = val & 0xffffff;
+                break;
+                case SST_startB: case SST_remap_startB:
+                voodoo->params.startB = val & 0xffffff;
+                break;
+                case SST_startZ: case SST_remap_startZ:
+                voodoo->params.startZ = val;
+                break;
+                case SST_startA: case SST_remap_startA:
+                voodoo->params.startA = val & 0xffffff;
+                break;
+                case SST_startS: case SST_remap_startS:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].startS = ((int64_t)(int32_t)val) << 14;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].startS = ((int64_t)(int32_t)val) << 14;
+                break;
+                case SST_startT: case SST_remap_startT:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].startT = ((int64_t)(int32_t)val) << 14;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].startT = ((int64_t)(int32_t)val) << 14;
+                break;
+                case SST_startW: case SST_remap_startW:
+                if (chip & CHIP_FBI)
+                        voodoo->params.startW = (int64_t)(int32_t)val << 2;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].startW = (int64_t)(int32_t)val << 2;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].startW = (int64_t)(int32_t)val << 2;
+                break;
+
+                case SST_dRdX: case SST_remap_dRdX:
+                voodoo->params.dRdX = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dGdX: case SST_remap_dGdX:
+                voodoo->params.dGdX = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dBdX: case SST_remap_dBdX:
+                voodoo->params.dBdX = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dZdX: case SST_remap_dZdX:
+                voodoo->params.dZdX = val;
+                break;
+                case SST_dAdX: case SST_remap_dAdX:
+                voodoo->params.dAdX = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dSdX: case SST_remap_dSdX:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dSdX = ((int64_t)(int32_t)val) << 14;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dSdX = ((int64_t)(int32_t)val) << 14;
+                break;
+                case SST_dTdX: case SST_remap_dTdX:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dTdX = ((int64_t)(int32_t)val) << 14;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dTdX = ((int64_t)(int32_t)val) << 14;
+                break;
+                case SST_dWdX: case SST_remap_dWdX:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dWdX = (int64_t)(int32_t)val << 2;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dWdX = (int64_t)(int32_t)val << 2;
+                if (chip & CHIP_FBI)
+                        voodoo->params.dWdX = (int64_t)(int32_t)val << 2;
+                break;
+
+                case SST_dRdY: case SST_remap_dRdY:
+                voodoo->params.dRdY = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dGdY: case SST_remap_dGdY:
+                voodoo->params.dGdY = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dBdY: case SST_remap_dBdY:
+                voodoo->params.dBdY = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dZdY: case SST_remap_dZdY:
+                voodoo->params.dZdY = val;
+                break;
+                case SST_dAdY: case SST_remap_dAdY:
+                voodoo->params.dAdY = (val & 0xffffff) | ((val & 0x800000) ? 0xff000000 : 0);
+                break;
+                case SST_dSdY: case SST_remap_dSdY:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dSdY = ((int64_t)(int32_t)val) << 14;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dSdY = ((int64_t)(int32_t)val) << 14;
+                break;
+                case SST_dTdY: case SST_remap_dTdY:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dTdY = ((int64_t)(int32_t)val) << 14;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dTdY = ((int64_t)(int32_t)val) << 14;
+                break;
+                case SST_dWdY: case SST_remap_dWdY:
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dWdY = (int64_t)(int32_t)val << 2;
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dWdY = (int64_t)(int32_t)val << 2;
+                if (chip & CHIP_FBI)
+                        voodoo->params.dWdY = (int64_t)(int32_t)val << 2;
+                break;
+
+                case SST_triangleCMD: case SST_remap_triangleCMD:
+                voodoo->params.sign = val & (1 << 31);
+
+                if (voodoo->ncc_dirty[0])
+                        voodoo_update_ncc(voodoo, 0);
+                if (voodoo->ncc_dirty[1])
+                        voodoo_update_ncc(voodoo, 1);
+                voodoo->ncc_dirty[0] = voodoo->ncc_dirty[1] = 0;
+
+                voodoo_queue_triangle(voodoo, &voodoo->params);
+
+                voodoo->cmd_read++;
+                break;
+
+                case SST_fvertexAx: case SST_remap_fvertexAx:
+                voodoo->fvertexAx.i = val;
+                voodoo->params.vertexAx = (int32_t)(int16_t)(int32_t)(voodoo->fvertexAx.f * 16.0f) & 0xffff;
+                break;
+                case SST_fvertexAy: case SST_remap_fvertexAy:
+                voodoo->fvertexAy.i = val;
+                voodoo->params.vertexAy = (int32_t)(int16_t)(int32_t)(voodoo->fvertexAy.f * 16.0f) & 0xffff;
+                break;
+                case SST_fvertexBx: case SST_remap_fvertexBx:
+                voodoo->fvertexBx.i = val;
+                voodoo->params.vertexBx = (int32_t)(int16_t)(int32_t)(voodoo->fvertexBx.f * 16.0f) & 0xffff;
+                break;
+                case SST_fvertexBy: case SST_remap_fvertexBy:
+                voodoo->fvertexBy.i = val;
+                voodoo->params.vertexBy = (int32_t)(int16_t)(int32_t)(voodoo->fvertexBy.f * 16.0f) & 0xffff;
+                break;
+                case SST_fvertexCx: case SST_remap_fvertexCx:
+                voodoo->fvertexCx.i = val;
+                voodoo->params.vertexCx = (int32_t)(int16_t)(int32_t)(voodoo->fvertexCx.f * 16.0f) & 0xffff;
+                break;
+                case SST_fvertexCy: case SST_remap_fvertexCy:
+                voodoo->fvertexCy.i = val;
+                voodoo->params.vertexCy = (int32_t)(int16_t)(int32_t)(voodoo->fvertexCy.f * 16.0f) & 0xffff;
+                break;
+
+                case SST_fstartR: case SST_remap_fstartR:
+                tempif.i = val;
+                voodoo->params.startR = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fstartG: case SST_remap_fstartG:
+                tempif.i = val;
+                voodoo->params.startG = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fstartB: case SST_remap_fstartB:
+                tempif.i = val;
+                voodoo->params.startB = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fstartZ: case SST_remap_fstartZ:
+                tempif.i = val;
+                voodoo->params.startZ = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fstartA: case SST_remap_fstartA:
+                tempif.i = val;
+                voodoo->params.startA = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fstartS: case SST_remap_fstartS:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].startS = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].startS = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+                case SST_fstartT: case SST_remap_fstartT:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].startT = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].startT = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+                case SST_fstartW: case SST_remap_fstartW:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].startW = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].startW = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_FBI)
+                        voodoo->params.startW = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+
+                case SST_fdRdX: case SST_remap_fdRdX:
+                tempif.i = val;
+                voodoo->params.dRdX = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdGdX: case SST_remap_fdGdX:
+                tempif.i = val;
+                voodoo->params.dGdX = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdBdX: case SST_remap_fdBdX:
+                tempif.i = val;
+                voodoo->params.dBdX = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdZdX: case SST_remap_fdZdX:
+                tempif.i = val;
+                voodoo->params.dZdX = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdAdX: case SST_remap_fdAdX:
+                tempif.i = val;
+                voodoo->params.dAdX = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdSdX: case SST_remap_fdSdX:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dSdX = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dSdX = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+                case SST_fdTdX: case SST_remap_fdTdX:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dTdX = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dTdX = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+                case SST_fdWdX: case SST_remap_fdWdX:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dWdX = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dWdX = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_FBI)
+                        voodoo->params.dWdX = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+
+                case SST_fdRdY: case SST_remap_fdRdY:
+                tempif.i = val;
+                voodoo->params.dRdY = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdGdY: case SST_remap_fdGdY:
+                tempif.i = val;
+                voodoo->params.dGdY = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdBdY: case SST_remap_fdBdY:
+                tempif.i = val;
+                voodoo->params.dBdY = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdZdY: case SST_remap_fdZdY:
+                tempif.i = val;
+                voodoo->params.dZdY = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdAdY: case SST_remap_fdAdY:
+                tempif.i = val;
+                voodoo->params.dAdY = (int32_t)(tempif.f * 4096.0f);
+                break;
+                case SST_fdSdY: case SST_remap_fdSdY:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dSdY = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dSdY = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+                case SST_fdTdY: case SST_remap_fdTdY:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dTdY = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dTdY = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+                case SST_fdWdY: case SST_remap_fdWdY:
+                tempif.i = val;
+                if (chip & CHIP_TREX0)
+                        voodoo->params.tmu[0].dWdY = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_TREX1)
+                        voodoo->params.tmu[1].dWdY = (int64_t)(tempif.f * 4294967296.0f);
+                if (chip & CHIP_FBI)
+                        voodoo->params.dWdY = (int64_t)(tempif.f * 4294967296.0f);
+                break;
+
+                case SST_ftriangleCMD:
+                voodoo->params.sign = val & (1 << 31);
+
+                if (voodoo->ncc_dirty[0])
+                        voodoo_update_ncc(voodoo, 0);
+                if (voodoo->ncc_dirty[1])
+                        voodoo_update_ncc(voodoo, 1);
+                voodoo->ncc_dirty[0] = voodoo->ncc_dirty[1] = 0;
+
+                voodoo_queue_triangle(voodoo, &voodoo->params);
+
+                voodoo->cmd_read++;
+                break;
+
+                case SST_fbzColorPath:
+                voodoo->params.fbzColorPath = val;
+                voodoo->rgb_sel = val & 3;
+                break;
+
+                case SST_fogMode:
+                voodoo->params.fogMode = val;
+                break;
+                case SST_alphaMode:
+                voodoo->params.alphaMode = val;
+                break;
+                case SST_fbzMode:
+                voodoo->params.fbzMode = val;
+                voodoo_recalc(voodoo);
+                break;
+                case SST_lfbMode:
+                voodoo->lfbMode = val;
+                voodoo_recalc(voodoo);
+                break;
+
+                case SST_clipLeftRight:
+                if (voodoo->type >= VOODOO_2)
+                {
+                        voodoo->params.clipRight = val & 0xfff;
+                        voodoo->params.clipLeft = (val >> 16) & 0xfff;
+                }
+                else
+                {
+                        voodoo->params.clipRight = val & 0x3ff;
+                        voodoo->params.clipLeft = (val >> 16) & 0x3ff;
+                }
+                break;
+                case SST_clipLowYHighY:
+                if (voodoo->type >= VOODOO_2)
+                {
+                        voodoo->params.clipHighY = val & 0xfff;
+                        voodoo->params.clipLowY = (val >> 16) & 0xfff;
+                }
+                else
+                {
+                        voodoo->params.clipHighY = val & 0x3ff;
+                        voodoo->params.clipLowY = (val >> 16) & 0x3ff;
+                }
+                break;
+
+                case SST_nopCMD:
+                voodoo->cmd_read++;
+                voodoo->fbiPixelsIn = 0;
+                voodoo->fbiChromaFail = 0;
+                voodoo->fbiZFuncFail = 0;
+                voodoo->fbiAFuncFail = 0;
+                voodoo->fbiPixelsOut = 0;
+                break;
+                case SST_fastfillCMD:
+                voodoo_wait_for_render_thread_idle(voodoo);
+                voodoo_fastfill(voodoo, &voodoo->params);
+                voodoo->cmd_read++;
+                break;
+
+                case SST_fogColor:
+                voodoo->params.fogColor.r = (val >> 16) & 0xff;
+                voodoo->params.fogColor.g = (val >> 8) & 0xff;
+                voodoo->params.fogColor.b = val & 0xff;
+                break;
+
+                case SST_zaColor:
+                voodoo->params.zaColor = val;
+                break;
+                case SST_chromaKey:
+                voodoo->params.chromaKey_r = (val >> 16) & 0xff;
+                voodoo->params.chromaKey_g = (val >> 8) & 0xff;
+                voodoo->params.chromaKey_b = val & 0xff;
+                voodoo->params.chromaKey = val & 0xffffff;
+                break;
+                case SST_stipple:
+                voodoo->params.stipple = val;
+                break;
+                case SST_color0:
+                voodoo->params.color0 = val;
+                break;
+                case SST_color1:
+                voodoo->params.color1 = val;
+                break;
+
+                case SST_fogTable00: case SST_fogTable01: case SST_fogTable02: case SST_fogTable03:
+                case SST_fogTable04: case SST_fogTable05: case SST_fogTable06: case SST_fogTable07:
+                case SST_fogTable08: case SST_fogTable09: case SST_fogTable0a: case SST_fogTable0b:
+                case SST_fogTable0c: case SST_fogTable0d: case SST_fogTable0e: case SST_fogTable0f:
+                case SST_fogTable10: case SST_fogTable11: case SST_fogTable12: case SST_fogTable13:
+                case SST_fogTable14: case SST_fogTable15: case SST_fogTable16: case SST_fogTable17:
+                case SST_fogTable18: case SST_fogTable19: case SST_fogTable1a: case SST_fogTable1b:
+                case SST_fogTable1c: case SST_fogTable1d: case SST_fogTable1e: case SST_fogTable1f:
+                addr = (addr - SST_fogTable00) >> 1;
+                voodoo->params.fogTable[addr].dfog   = val & 0xff;
+                voodoo->params.fogTable[addr].fog    = (val >> 8) & 0xff;
+                voodoo->params.fogTable[addr+1].dfog = (val >> 16) & 0xff;
+                voodoo->params.fogTable[addr+1].fog  = (val >> 24) & 0xff;
+                break;
+                
+                case SST_clipLeftRight1:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+                        voodoo->params.clipRight1 = val & 0xfff;
+                        voodoo->params.clipLeft1 = (val >> 16) & 0xfff;
+                }
+                break;
+                case SST_clipTopBottom1:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+                        voodoo->params.clipHighY1 = val & 0xfff;
+                        voodoo->params.clipLowY1 = (val >> 16) & 0xfff;
+                }
+                break;
+
+                case SST_colBufferAddr:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+                        voodoo->params.draw_offset = val & 0xfffff0;
+                        voodoo->fb_write_offset = voodoo->params.draw_offset;
+//                        pclog("colorBufferAddr=%06x\n", voodoo->params.draw_offset);
+                }
+                break;
+                case SST_colBufferStride:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+                        voodoo->col_tiled = val & (1 << 15);
+                        voodoo->params.col_tiled = voodoo->col_tiled;
+                        if (voodoo->col_tiled)
+                        {
+                                voodoo->row_width = (val & 0x7f) * 128*32;
+//                                pclog("colBufferStride tiled = %i bytes, tiled  %08x\n", voodoo->row_width, val);
+                        }
+                        else
+                        {
+                                voodoo->row_width = val & 0x3fff;
+//                                pclog("colBufferStride linear = %i bytes, linear\n", voodoo->row_width);
+                        }
+                        voodoo->params.row_width = voodoo->row_width;
+                }
+                break;
+                case SST_auxBufferAddr:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+                        voodoo->params.aux_offset = val & 0xfffff0;
+//                        pclog("auxBufferAddr=%06x\n", voodoo->params.aux_offset);
+                }
+                break;
+                case SST_auxBufferStride:
+                if (voodoo->type >= VOODOO_BANSHEE)
+                {
+                        voodoo->aux_tiled = val & (1 << 15);
+                        voodoo->params.aux_tiled = voodoo->aux_tiled;
+                        if (voodoo->aux_tiled)
+                        {
+                                voodoo->aux_row_width = (val & 0x7f) * 128*32;
+//                                pclog("auxBufferStride tiled = %i bytes, tiled\n", voodoo->aux_row_width);
+                        }
+                        else
+                        {
+                                voodoo->aux_row_width = val & 0x3fff;
+//                                pclog("auxBufferStride linear = %i bytes, linear\n", voodoo->aux_row_width);
+                        }
+                        voodoo->params.aux_row_width = voodoo->aux_row_width;
+                }
+                break;
+
+                case SST_clutData:
+                voodoo->clutData[(val >> 24) & 0x3f].b = val & 0xff;
+                voodoo->clutData[(val >> 24) & 0x3f].g = (val >> 8) & 0xff;
+                voodoo->clutData[(val >> 24) & 0x3f].r = (val >> 16) & 0xff;
+                if (val & 0x20000000)
+                {
+                        voodoo->clutData[(val >> 24) & 0x3f].b = 255;
+                        voodoo->clutData[(val >> 24) & 0x3f].g = 255;
+                        voodoo->clutData[(val >> 24) & 0x3f].r = 255;
+                }
+                voodoo->clutData_dirty = 1;
+                break;
+
+                case SST_sSetupMode:
+                voodoo->sSetupMode = val;
+                break;
+                case SST_sVx:
+                tempif.i = val;
+                voodoo->verts[3].sVx = tempif.f;
+//                pclog("sVx[%i]=%f\n", voodoo->vertex_num, tempif.f);
+                break;
+                case SST_sVy:
+                tempif.i = val;
+                voodoo->verts[3].sVy = tempif.f;
+//                pclog("sVy[%i]=%f\n", voodoo->vertex_num, tempif.f);
+                break;
+                case SST_sARGB:
+                voodoo->verts[3].sBlue  = (float)(val & 0xff);
+                voodoo->verts[3].sGreen = (float)((val >> 8) & 0xff);
+                voodoo->verts[3].sRed   = (float)((val >> 16) & 0xff);
+                voodoo->verts[3].sAlpha = (float)((val >> 24) & 0xff);
+                break;
+                case SST_sRed:
+                tempif.i = val;
+                voodoo->verts[3].sRed = tempif.f;
+                break;
+                case SST_sGreen:
+                tempif.i = val;
+                voodoo->verts[3].sGreen = tempif.f;
+                break;
+                case SST_sBlue:
+                tempif.i = val;
+                voodoo->verts[3].sBlue = tempif.f;
+                break;
+                case SST_sAlpha:
+                tempif.i = val;
+                voodoo->verts[3].sAlpha = tempif.f;
+                break;
+                case SST_sVz:
+                tempif.i = val;
+                voodoo->verts[3].sVz = tempif.f;
+                break;
+                case SST_sWb:
+                tempif.i = val;
+                voodoo->verts[3].sWb = tempif.f;
+                break;
+                case SST_sW0:
+                tempif.i = val;
+                voodoo->verts[3].sW0 = tempif.f;
+                break;
+                case SST_sS0:
+                tempif.i = val;
+                voodoo->verts[3].sS0 = tempif.f;
+                break;
+                case SST_sT0:
+                tempif.i = val;
+                voodoo->verts[3].sT0 = tempif.f;
+                break;
+                case SST_sW1:
+                tempif.i = val;
+                voodoo->verts[3].sW1 = tempif.f;
+                break;
+                case SST_sS1:
+                tempif.i = val;
+                voodoo->verts[3].sS1 = tempif.f;
+                break;
+                case SST_sT1:
+                tempif.i = val;
+                voodoo->verts[3].sT1 = tempif.f;
+                break;
+
+                case SST_sBeginTriCMD:
+//                pclog("sBeginTriCMD %i %f\n", voodoo->vertex_num, voodoo->verts[4].sVx);
+                voodoo->verts[0] = voodoo->verts[3];
+                voodoo->verts[1] = voodoo->verts[3];
+                voodoo->verts[2] = voodoo->verts[3];
+                voodoo->vertex_next_age = 0;
+                voodoo->vertex_ages[0] = voodoo->vertex_next_age++;
+
+                voodoo->num_verticies = 1;
+                voodoo->cull_pingpong = 0;
+                break;
+                case SST_sDrawTriCMD:
+//                pclog("sDrawTriCMD %i %i\n", voodoo->num_verticies, voodoo->sSetupMode & SETUPMODE_STRIP_MODE);
+                /*I'm not sure this is the vertex selection algorithm actually used in the 3dfx
+                  chips, but this works with a number of games that switch between strip and fan
+                  mode in the middle of a run (eg Black & White, Viper Racing)*/
+                if (voodoo->vertex_next_age < 3)
+                {
+                        /*Fewer than three vertices already written, store in next slot*/
+                        int vertex_nr = voodoo->vertex_next_age;
+
+                        voodoo->verts[vertex_nr] = voodoo->verts[3];
+                        voodoo->vertex_ages[vertex_nr] = voodoo->vertex_next_age++;
+                }
+                else
+                {
+                        int vertex_nr = 0;
+
+                        if (!(voodoo->sSetupMode & SETUPMODE_STRIP_MODE))
+                        {
+                                /*Strip - find oldest vertex*/
+                                if ((voodoo->vertex_ages[0] < voodoo->vertex_ages[1]) &&
+                                    (voodoo->vertex_ages[0] < voodoo->vertex_ages[2]))
+                                        vertex_nr = 0;
+                                else if ((voodoo->vertex_ages[1] < voodoo->vertex_ages[0]) &&
+                                    (voodoo->vertex_ages[1] < voodoo->vertex_ages[2]))
+                                        vertex_nr = 1;
+                                else
+                                        vertex_nr = 2;
+                        }
+                        else
+                        {
+                                /*Fan - find second oldest vertex (ie pivot around oldest)*/
+                                if ((voodoo->vertex_ages[1] < voodoo->vertex_ages[0]) &&
+                                    (voodoo->vertex_ages[0] < voodoo->vertex_ages[2]))
+                                        vertex_nr = 0;
+                                else if ((voodoo->vertex_ages[2] < voodoo->vertex_ages[0]) &&
+                                    (voodoo->vertex_ages[0] < voodoo->vertex_ages[1]))
+                                        vertex_nr = 0;
+                                else if ((voodoo->vertex_ages[0] < voodoo->vertex_ages[1]) &&
+                                    (voodoo->vertex_ages[1] < voodoo->vertex_ages[2]))
+                                        vertex_nr = 1;
+                                else if ((voodoo->vertex_ages[2] < voodoo->vertex_ages[1]) &&
+                                    (voodoo->vertex_ages[1] < voodoo->vertex_ages[0]))
+                                        vertex_nr = 1;
+                                else
+                                        vertex_nr = 2;
+                        }
+                        voodoo->verts[vertex_nr] = voodoo->verts[3];
+                        voodoo->vertex_ages[vertex_nr] = voodoo->vertex_next_age++;
+                }
+
+                voodoo->num_verticies++;
+                if (voodoo->num_verticies == 3)
+                {
+//                        pclog("triangle_setup\n");
+                        voodoo_triangle_setup(voodoo);
+                        voodoo->cull_pingpong = !voodoo->cull_pingpong;
+
+                        voodoo->num_verticies = 2;
+                }
+                break;
+
+                case SST_bltSrcBaseAddr:
+                voodoo->bltSrcBaseAddr = val & 0x3fffff;
+                break;
+                case SST_bltDstBaseAddr:
+//                pclog("Write bltDstBaseAddr %08x\n", val);
+                voodoo->bltDstBaseAddr = val & 0x3fffff;
+                break;
+                case SST_bltXYStrides:
+                voodoo->bltSrcXYStride = val & 0xfff;
+                voodoo->bltDstXYStride = (val >> 16) & 0xfff;
+//                pclog("Write bltXYStrides %08x\n", val);
+                break;
+                case SST_bltSrcChromaRange:
+                voodoo->bltSrcChromaRange = val;
+                voodoo->bltSrcChromaMinB = val & 0x1f;
+                voodoo->bltSrcChromaMinG = (val >> 5) & 0x3f;
+                voodoo->bltSrcChromaMinR = (val >> 11) & 0x1f;
+                voodoo->bltSrcChromaMaxB = (val >> 16) & 0x1f;
+                voodoo->bltSrcChromaMaxG = (val >> 21) & 0x3f;
+                voodoo->bltSrcChromaMaxR = (val >> 27) & 0x1f;
+                break;
+                case SST_bltDstChromaRange:
+                voodoo->bltDstChromaRange = val;
+                voodoo->bltDstChromaMinB = val & 0x1f;
+                voodoo->bltDstChromaMinG = (val >> 5) & 0x3f;
+                voodoo->bltDstChromaMinR = (val >> 11) & 0x1f;
+                voodoo->bltDstChromaMaxB = (val >> 16) & 0x1f;
+                voodoo->bltDstChromaMaxG = (val >> 21) & 0x3f;
+                voodoo->bltDstChromaMaxR = (val >> 27) & 0x1f;
+                break;
+                case SST_bltClipX:
+                voodoo->bltClipRight = val & 0xfff;
+                voodoo->bltClipLeft = (val >> 16) & 0xfff;
+                break;
+                case SST_bltClipY:
+                voodoo->bltClipHighY = val & 0xfff;
+                voodoo->bltClipLowY = (val >> 16) & 0xfff;
+                break;
+
+                case SST_bltSrcXY:
+                voodoo->bltSrcX = val & 0x7ff;
+                voodoo->bltSrcY = (val >> 16) & 0x7ff;
+                break;
+                case SST_bltDstXY:
+//                pclog("Write bltDstXY %08x\n", val);
+                voodoo->bltDstX = val & 0x7ff;
+                voodoo->bltDstY = (val >> 16) & 0x7ff;
+                if (val & (1 << 31))
+                        voodoo_v2_blit_start(voodoo);
+                break;
+                case SST_bltSize:
+//                pclog("Write bltSize %08x\n", val);
+                voodoo->bltSizeX = val & 0xfff;
+                if (voodoo->bltSizeX & 0x800)
+                        voodoo->bltSizeX |= 0xfffff000;
+                voodoo->bltSizeY = (val >> 16) & 0xfff;
+                if (voodoo->bltSizeY & 0x800)
+                        voodoo->bltSizeY |= 0xfffff000;
+                if (val & (1 << 31))
+                        voodoo_v2_blit_start(voodoo);
+                break;
+                case SST_bltRop:
+                voodoo->bltRop[0] = val & 0xf;
+                voodoo->bltRop[1] = (val >> 4) & 0xf;
+                voodoo->bltRop[2] = (val >> 8) & 0xf;
+                voodoo->bltRop[3] = (val >> 12) & 0xf;
+                break;
+                case SST_bltColor:
+//                pclog("Write bltColor %08x\n", val);
+                voodoo->bltColorFg = val & 0xffff;
+                voodoo->bltColorBg = (val >> 16) & 0xffff;
+                break;
+
+                case SST_bltCommand:
+                voodoo->bltCommand = val;
+//                pclog("Write bltCommand %08x\n", val);
+                if (val & (1 << 31))
+                        voodoo_v2_blit_start(voodoo);
+                break;
+                case SST_bltData:
+                voodoo_v2_blit_data(voodoo, val);
+                break;
+
+                case SST_textureMode:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->params.textureMode[0] = val;
+                        voodoo->params.tformat[0] = (val >> 8) & 0xf;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->params.textureMode[1] = val;
+                        voodoo->params.tformat[1] = (val >> 8) & 0xf;
+                }
+                break;
+                case SST_tLOD:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->params.tLOD[0] = val;
+                        voodoo_recalc_tex(voodoo, 0);
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->params.tLOD[1] = val;
+                        voodoo_recalc_tex(voodoo, 1);
+                }
+                break;
+                case SST_tDetail:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->params.detail_max[0] = val & 0xff;
+                        voodoo->params.detail_bias[0] = (val >> 8) & 0x3f;
+                        voodoo->params.detail_scale[0] = (val >> 14) & 7;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->params.detail_max[1] = val & 0xff;
+                        voodoo->params.detail_bias[1] = (val >> 8) & 0x3f;
+                        voodoo->params.detail_scale[1] = (val >> 14) & 7;
+                }
+                break;
+                case SST_texBaseAddr:
+                if (chip & CHIP_TREX0)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr[0] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr[0] = (val & 0x7ffff) << 3;
+//                        pclog("texBaseAddr = %08x %08x\n", voodoo->params.texBaseAddr[0], val);
+                        voodoo_recalc_tex(voodoo, 0);
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr[1] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr[1] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 1);
+                }
+                break;
+                case SST_texBaseAddr1:
+                if (chip & CHIP_TREX0)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr1[0] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr1[0] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 0);
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr1[1] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr1[1] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 1);
+                }
+                break;
+                case SST_texBaseAddr2:
+                if (chip & CHIP_TREX0)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr2[0] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr2[0] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 0);
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr2[1] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr2[1] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 1);
+                }
+                break;
+                case SST_texBaseAddr38:
+                if (chip & CHIP_TREX0)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr38[0] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr38[0] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 0);
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        if (voodoo->type >= VOODOO_BANSHEE)
+                                voodoo->params.texBaseAddr38[1] = val & 0xfffff0;
+                        else
+                                voodoo->params.texBaseAddr38[1] = (val & 0x7ffff) << 3;
+                        voodoo_recalc_tex(voodoo, 1);
+                }
+                break;
+
+                case SST_trexInit1:
+                if (chip & CHIP_TREX0)
+                        voodoo->trexInit1[0] = val;
+                if (chip & CHIP_TREX1)
+                        voodoo->trexInit1[1] = val;
+                break;
+
+                case SST_nccTable0_Y0:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][0].y[0] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][0].y[0] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable0_Y1:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][0].y[1] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][0].y[1] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable0_Y2:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][0].y[2] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][0].y[2] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable0_Y3:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][0].y[3] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][0].y[3] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+
+                case SST_nccTable0_I0:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].i[0] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].i[0] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                case SST_nccTable0_I2:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].i[2] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].i[2] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                case SST_nccTable0_Q0:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].q[0] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].q[0] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                case SST_nccTable0_Q2:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].i[2] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].i[2] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                if (val & (1 << 31))
+                {
+                        int p = (val >> 23) & 0xfe;
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->palette[0][p].u = val | 0xff000000;
+                                voodoo->palette_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->palette[1][p].u = val | 0xff000000;
+                                voodoo->palette_dirty[1] = 1;
+                        }
+                }
+                break;
+
+                case SST_nccTable0_I1:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].i[1] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].i[1] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                case SST_nccTable0_I3:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].i[3] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].i[3] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                case SST_nccTable0_Q1:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].q[1] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].q[1] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                case SST_nccTable0_Q3:
+                if (!(val & (1 << 31)))
+                {
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->nccTable[0][0].q[3] = val;
+                                voodoo->ncc_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->nccTable[1][0].q[3] = val;
+                                voodoo->ncc_dirty[1] = 1;
+                        }
+                        break;
+                }
+                if (val & (1 << 31))
+                {
+                        int p = ((val >> 23) & 0xfe) | 0x01;
+                        if (chip & CHIP_TREX0)
+                        {
+                                voodoo->palette[0][p].u = val | 0xff000000;
+                                voodoo->palette_dirty[0] = 1;
+                        }
+                        if (chip & CHIP_TREX1)
+                        {
+                                voodoo->palette[1][p].u = val | 0xff000000;
+                                voodoo->palette_dirty[1] = 1;
+                        }
+                }
+                break;
+
+                case SST_nccTable1_Y0:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].y[0] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].y[0] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Y1:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].y[1] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].y[1] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Y2:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].y[2] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].y[2] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Y3:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].y[3] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].y[3] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_I0:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].i[0] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].i[0] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_I1:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].i[1] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].i[1] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_I2:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].i[2] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].i[2] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_I3:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].i[3] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].i[3] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Q0:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].q[0] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].q[0] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Q1:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].q[1] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].q[1] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Q2:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].q[2] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].q[2] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+                case SST_nccTable1_Q3:
+                if (chip & CHIP_TREX0)
+                {
+                        voodoo->nccTable[0][1].q[3] = val;
+                        voodoo->ncc_dirty[0] = 1;
+                }
+                if (chip & CHIP_TREX1)
+                {
+                        voodoo->nccTable[1][1].q[3] = val;
+                        voodoo->ncc_dirty[1] = 1;
+                }
+                break;
+
+                case SST_userIntrCMD:
+                fatal("userIntrCMD write %08x from FIFO\n", val);
+                break;
+
+
+                case SST_leftOverlayBuf:
+                voodoo->leftOverlayBuf = val;
+                break;
+        }
+}
diff --git a/pcem/vid_voodoo_reg.h b/pcem/vid_voodoo_reg.h
new file mode 100644 (file)
index 0000000..f3a9418
--- /dev/null
@@ -0,0 +1 @@
+void voodoo_reg_writel(uint32_t addr, uint32_t val, void *p); /* 32-bit register write: addr selects the register, val is the data written; p is an opaque context pointer (presumably the voodoo_t device state - TODO confirm against the definition in vid_voodoo_reg.cpp). */
diff --git a/pcem/vid_voodoo_regs.h b/pcem/vid_voodoo_regs.h
new file mode 100644 (file)
index 0000000..f7ab208
--- /dev/null
@@ -0,0 +1,691 @@
+enum
+{
+        SST_status = 0x000,
+        SST_intrCtrl = 0x004,
+
+        SST_vertexAx = 0x008,
+        SST_vertexAy = 0x00c,
+        SST_vertexBx = 0x010,
+        SST_vertexBy = 0x014,
+        SST_vertexCx = 0x018,
+        SST_vertexCy = 0x01c,
+
+        SST_startR   = 0x0020,
+        SST_startG   = 0x0024,
+        SST_startB   = 0x0028,
+        SST_startZ   = 0x002c,
+        SST_startA   = 0x0030,
+        SST_startS   = 0x0034,
+        SST_startT   = 0x0038,
+        SST_startW   = 0x003c,
+
+        SST_dRdX     = 0x0040,
+        SST_dGdX     = 0x0044,
+        SST_dBdX     = 0x0048,
+        SST_dZdX     = 0x004c,
+        SST_dAdX     = 0x0050,
+        SST_dSdX     = 0x0054,
+        SST_dTdX     = 0x0058,
+        SST_dWdX     = 0x005c,
+
+        SST_dRdY     = 0x0060,
+        SST_dGdY     = 0x0064,
+        SST_dBdY     = 0x0068,
+        SST_dZdY     = 0x006c,
+        SST_dAdY     = 0x0070,
+        SST_dSdY     = 0x0074,
+        SST_dTdY     = 0x0078,
+        SST_dWdY     = 0x007c,
+
+        SST_triangleCMD = 0x0080,
+
+        SST_fvertexAx = 0x088,
+        SST_fvertexAy = 0x08c,
+        SST_fvertexBx = 0x090,
+        SST_fvertexBy = 0x094,
+        SST_fvertexCx = 0x098,
+        SST_fvertexCy = 0x09c,
+
+        SST_fstartR   = 0x00a0,
+        SST_fstartG   = 0x00a4,
+        SST_fstartB   = 0x00a8,
+        SST_fstartZ   = 0x00ac,
+        SST_fstartA   = 0x00b0,
+        SST_fstartS   = 0x00b4,
+        SST_fstartT   = 0x00b8,
+        SST_fstartW   = 0x00bc,
+
+        SST_fdRdX     = 0x00c0,
+        SST_fdGdX     = 0x00c4,
+        SST_fdBdX     = 0x00c8,
+        SST_fdZdX     = 0x00cc,
+        SST_fdAdX     = 0x00d0,
+        SST_fdSdX     = 0x00d4,
+        SST_fdTdX     = 0x00d8,
+        SST_fdWdX     = 0x00dc,
+
+        SST_fdRdY     = 0x00e0,
+        SST_fdGdY     = 0x00e4,
+        SST_fdBdY     = 0x00e8,
+        SST_fdZdY     = 0x00ec,
+        SST_fdAdY     = 0x00f0,
+        SST_fdSdY     = 0x00f4,
+        SST_fdTdY     = 0x00f8,
+        SST_fdWdY     = 0x00fc,
+
+        SST_ftriangleCMD = 0x0100,
+
+        SST_fbzColorPath = 0x104,
+        SST_fogMode = 0x108,
+
+        SST_alphaMode = 0x10c,
+        SST_fbzMode = 0x110,
+        SST_lfbMode = 0x114,
+
+        SST_clipLeftRight = 0x118,
+        SST_clipLowYHighY = 0x11c,
+
+        SST_nopCMD = 0x120,
+        SST_fastfillCMD = 0x124,
+        SST_swapbufferCMD = 0x128,
+
+        SST_fogColor = 0x12c,
+        SST_zaColor = 0x130,
+        SST_chromaKey = 0x134,
+
+        SST_userIntrCMD = 0x13c,
+        SST_stipple = 0x140,
+        SST_color0 = 0x144,
+        SST_color1 = 0x148,
+
+        SST_fbiPixelsIn = 0x14c,
+        SST_fbiChromaFail = 0x150,
+        SST_fbiZFuncFail = 0x154,
+        SST_fbiAFuncFail = 0x158,
+        SST_fbiPixelsOut = 0x15c,
+
+        SST_fogTable00 = 0x160,
+        SST_fogTable01 = 0x164,
+        SST_fogTable02 = 0x168,
+        SST_fogTable03 = 0x16c,
+        SST_fogTable04 = 0x170,
+        SST_fogTable05 = 0x174,
+        SST_fogTable06 = 0x178,
+        SST_fogTable07 = 0x17c,
+        SST_fogTable08 = 0x180,
+        SST_fogTable09 = 0x184,
+        SST_fogTable0a = 0x188,
+        SST_fogTable0b = 0x18c,
+        SST_fogTable0c = 0x190,
+        SST_fogTable0d = 0x194,
+        SST_fogTable0e = 0x198,
+        SST_fogTable0f = 0x19c,
+        SST_fogTable10 = 0x1a0,
+        SST_fogTable11 = 0x1a4,
+        SST_fogTable12 = 0x1a8,
+        SST_fogTable13 = 0x1ac,
+        SST_fogTable14 = 0x1b0,
+        SST_fogTable15 = 0x1b4,
+        SST_fogTable16 = 0x1b8,
+        SST_fogTable17 = 0x1bc,
+        SST_fogTable18 = 0x1c0,
+        SST_fogTable19 = 0x1c4,
+        SST_fogTable1a = 0x1c8,
+        SST_fogTable1b = 0x1cc,
+        SST_fogTable1c = 0x1d0,
+        SST_fogTable1d = 0x1d4,
+        SST_fogTable1e = 0x1d8,
+        SST_fogTable1f = 0x1dc,
+
+        SST_cmdFifoBaseAddr = 0x1e0,
+        SST_cmdFifoBump = 0x1e4,
+        SST_cmdFifoRdPtr = 0x1e8,
+        SST_cmdFifoAMin = 0x1ec,
+        SST_cmdFifoAMax = 0x1f0,
+        SST_cmdFifoDepth = 0x1f4,
+        SST_cmdFifoHoles = 0x1f8,
+        
+        SST_colBufferAddr = 0x1ec,   /*Banshee*/
+        SST_colBufferStride = 0x1f0, /*Banshee*/
+        SST_auxBufferAddr = 0x1f4,   /*Banshee*/
+        SST_auxBufferStride = 0x1f8, /*Banshee*/
+
+        SST_clipLeftRight1 = 0x200, /*Banshee*/
+        SST_clipTopBottom1 = 0x204, /*Banshee*/
+
+        SST_fbiInit4 = 0x200,
+        SST_vRetrace = 0x204,
+        SST_backPorch = 0x208,
+        SST_videoDimensions = 0x20c,
+        SST_fbiInit0 = 0x210,
+        SST_fbiInit1 = 0x214,
+        SST_fbiInit2 = 0x218,
+        SST_fbiInit3 = 0x21c,
+        SST_hSync = 0x220,
+        SST_vSync = 0x224,
+        SST_clutData = 0x228,
+        SST_dacData = 0x22c,
+
+       SST_scrFilter = 0x230,
+
+        SST_hvRetrace = 0x240,
+        SST_fbiInit5 = 0x244,
+        SST_fbiInit6 = 0x248,
+        SST_fbiInit7 = 0x24c,
+        
+        SST_swapPending = 0x24c, /*Banshee*/
+        SST_leftOverlayBuf = 0x250, /*Banshee*/
+        
+        SST_sSetupMode = 0x260,
+        SST_sVx    = 0x264,
+        SST_sVy    = 0x268,
+        SST_sARGB  = 0x26c,
+        SST_sRed   = 0x270,
+        SST_sGreen = 0x274,
+        SST_sBlue  = 0x278,
+        SST_sAlpha = 0x27c,
+        SST_sVz    = 0x280,
+        SST_sWb    = 0x284,
+        SST_sW0    = 0x288,
+        SST_sS0    = 0x28c,
+        SST_sT0    = 0x290,
+        SST_sW1    = 0x294,
+        SST_sS1    = 0x298,
+        SST_sT1    = 0x29c,
+
+        SST_sDrawTriCMD = 0x2a0,
+        SST_sBeginTriCMD = 0x2a4,
+
+        SST_bltSrcBaseAddr = 0x2c0,
+        SST_bltDstBaseAddr = 0x2c4,
+        SST_bltXYStrides = 0x2c8,
+        SST_bltSrcChromaRange = 0x2cc,
+        SST_bltDstChromaRange = 0x2d0,
+        SST_bltClipX = 0x2d4,
+        SST_bltClipY = 0x2d8,
+
+        SST_bltSrcXY = 0x2e0,
+        SST_bltDstXY = 0x2e4,
+        SST_bltSize = 0x2e8,
+        SST_bltRop = 0x2ec,
+        SST_bltColor = 0x2f0,
+
+        SST_bltCommand = 0x2f8,
+        SST_bltData = 0x2fc,
+
+        SST_textureMode = 0x300,
+        SST_tLOD = 0x304,
+        SST_tDetail = 0x308,
+        SST_texBaseAddr = 0x30c,
+        SST_texBaseAddr1 = 0x310,
+        SST_texBaseAddr2 = 0x314,
+        SST_texBaseAddr38 = 0x318,
+
+        SST_trexInit1 = 0x320,
+
+        SST_nccTable0_Y0 = 0x324,
+        SST_nccTable0_Y1 = 0x328,
+        SST_nccTable0_Y2 = 0x32c,
+        SST_nccTable0_Y3 = 0x330,
+        SST_nccTable0_I0 = 0x334,
+        SST_nccTable0_I1 = 0x338,
+        SST_nccTable0_I2 = 0x33c,
+        SST_nccTable0_I3 = 0x340,
+        SST_nccTable0_Q0 = 0x344,
+        SST_nccTable0_Q1 = 0x348,
+        SST_nccTable0_Q2 = 0x34c,
+        SST_nccTable0_Q3 = 0x350,
+
+        SST_nccTable1_Y0 = 0x354,
+        SST_nccTable1_Y1 = 0x358,
+        SST_nccTable1_Y2 = 0x35c,
+        SST_nccTable1_Y3 = 0x360,
+        SST_nccTable1_I0 = 0x364,
+        SST_nccTable1_I1 = 0x368,
+        SST_nccTable1_I2 = 0x36c,
+        SST_nccTable1_I3 = 0x370,
+        SST_nccTable1_Q0 = 0x374,
+        SST_nccTable1_Q1 = 0x378,
+        SST_nccTable1_Q2 = 0x37c,
+        SST_nccTable1_Q3 = 0x380,
+
+        SST_remap_status = 0x000 | 0x400,
+
+        SST_remap_vertexAx = 0x008 | 0x400,
+        SST_remap_vertexAy = 0x00c | 0x400,
+        SST_remap_vertexBx = 0x010 | 0x400,
+        SST_remap_vertexBy = 0x014 | 0x400,
+        SST_remap_vertexCx = 0x018 | 0x400,
+        SST_remap_vertexCy = 0x01c | 0x400,
+
+        SST_remap_startR   = 0x0020 | 0x400,
+        SST_remap_startG   = 0x002c | 0x400,
+        SST_remap_startB   = 0x0038 | 0x400,
+        SST_remap_startZ   = 0x0044 | 0x400,
+        SST_remap_startA   = 0x0050 | 0x400,
+        SST_remap_startS   = 0x005c | 0x400,
+        SST_remap_startT   = 0x0068 | 0x400,
+        SST_remap_startW   = 0x0074 | 0x400,
+
+        SST_remap_dRdX     = 0x0024 | 0x400,
+        SST_remap_dGdX     = 0x0030 | 0x400,
+        SST_remap_dBdX     = 0x003c | 0x400,
+        SST_remap_dZdX     = 0x0048 | 0x400,
+        SST_remap_dAdX     = 0x0054 | 0x400,
+        SST_remap_dSdX     = 0x0060 | 0x400,
+        SST_remap_dTdX     = 0x006c | 0x400,
+        SST_remap_dWdX     = 0x0078 | 0x400,
+
+        SST_remap_dRdY     = 0x0028 | 0x400,
+        SST_remap_dGdY     = 0x0034 | 0x400,
+        SST_remap_dBdY     = 0x0040 | 0x400,
+        SST_remap_dZdY     = 0x004c | 0x400,
+        SST_remap_dAdY     = 0x0058 | 0x400,
+        SST_remap_dSdY     = 0x0064 | 0x400,
+        SST_remap_dTdY     = 0x0070 | 0x400,
+        SST_remap_dWdY     = 0x007c | 0x400,
+
+        SST_remap_triangleCMD = 0x0080 | 0x400,
+
+        SST_remap_fvertexAx = 0x088 | 0x400,
+        SST_remap_fvertexAy = 0x08c | 0x400,
+        SST_remap_fvertexBx = 0x090 | 0x400,
+        SST_remap_fvertexBy = 0x094 | 0x400,
+        SST_remap_fvertexCx = 0x098 | 0x400,
+        SST_remap_fvertexCy = 0x09c | 0x400,
+
+        SST_remap_fstartR   = 0x00a0 | 0x400,
+        SST_remap_fstartG   = 0x00ac | 0x400,
+        SST_remap_fstartB   = 0x00b8 | 0x400,
+        SST_remap_fstartZ   = 0x00c4 | 0x400,
+        SST_remap_fstartA   = 0x00d0 | 0x400,
+        SST_remap_fstartS   = 0x00dc | 0x400,
+        SST_remap_fstartT   = 0x00e8 | 0x400,
+        SST_remap_fstartW   = 0x00f4 | 0x400,
+
+        SST_remap_fdRdX     = 0x00a4 | 0x400,
+        SST_remap_fdGdX     = 0x00b0 | 0x400,
+        SST_remap_fdBdX     = 0x00bc | 0x400,
+        SST_remap_fdZdX     = 0x00c8 | 0x400,
+        SST_remap_fdAdX     = 0x00d4 | 0x400,
+        SST_remap_fdSdX     = 0x00e0 | 0x400,
+        SST_remap_fdTdX     = 0x00ec | 0x400,
+        SST_remap_fdWdX     = 0x00f8 | 0x400,
+
+        SST_remap_fdRdY     = 0x00a8 | 0x400,
+        SST_remap_fdGdY     = 0x00b4 | 0x400,
+        SST_remap_fdBdY     = 0x00c0 | 0x400,
+        SST_remap_fdZdY     = 0x00cc | 0x400,
+        SST_remap_fdAdY     = 0x00d8 | 0x400,
+        SST_remap_fdSdY     = 0x00e4 | 0x400,
+        SST_remap_fdTdY     = 0x00f0 | 0x400,
+        SST_remap_fdWdY     = 0x00fc | 0x400,
+};
+/* LFB write-buffer selector. LFB_WRITE_MASK (0x0030) isolates the two-bit
+ * field at bits 4-5; LFB_WRITE_FRONT and LFB_WRITE_BACK are values of that
+ * field. Presumably a field of the lfbMode register - TODO confirm, the code
+ * that consumes these constants is not in this part of the file. */
+enum
+{
+        LFB_WRITE_FRONT = 0x0000,
+        LFB_WRITE_BACK  = 0x0010,
+        LFB_WRITE_MASK  = 0x0030
+};
+/* LFB read-buffer selector. LFB_READ_MASK (0x00c0) isolates the two-bit field
+ * at bits 6-7, which takes the FRONT, BACK or AUX value. */
+enum
+{
+        LFB_READ_FRONT = 0x0000,
+        LFB_READ_BACK  = 0x0040,
+        LFB_READ_AUX   = 0x0080,
+        LFB_READ_MASK  = 0x00c0
+};
+/* LFB pixel format codes. LFB_FORMAT_MASK (15) selects a 4-bit field; note
+ * that it has the same numeric value as LFB_FORMAT_DEPTH. Codes 3, 4 and
+ * 6-14 are not named here. */
+enum
+{
+        LFB_FORMAT_RGB565 = 0,
+        LFB_FORMAT_RGB555 = 1,
+        LFB_FORMAT_ARGB1555 = 2,
+        LFB_FORMAT_ARGB8888 = 5,
+        LFB_FORMAT_DEPTH = 15,
+        LFB_FORMAT_MASK = 15
+};
+/* Two independent flag bits (1 and 2). Presumably they describe which planes
+ * an LFB write touches - TODO confirm against the LFB write code. */
+enum
+{
+        LFB_WRITE_COLOUR = 1,
+        LFB_WRITE_DEPTH = 2
+};
+/* Flag bits and field values beginning with FBZ_. The dither and dither2x2
+ * macros later in this file test FBZ_DITHER and FBZ_DITHER_2x2 against
+ * params->fbzMode, so these are presumably all fbzMode register bits.
+ * FBZ_DRAW_MASK (0xc000) isolates the two-bit field at bits 14-15 that takes
+ * the FBZ_DRAW_FRONT / FBZ_DRAW_BACK values; every other entry is one bit. */
+enum
+{
+        FBZ_CHROMAKEY = (1 << 1),
+        FBZ_W_BUFFER = (1 << 3),
+        FBZ_DEPTH_ENABLE = (1 << 4),
+
+        FBZ_DITHER      = (1 << 8),
+        FBZ_RGB_WMASK   = (1 << 9),
+        FBZ_DEPTH_WMASK = (1 << 10),
+        FBZ_DITHER_2x2  = (1 << 11),
+
+        FBZ_DRAW_FRONT = 0x0000,
+        FBZ_DRAW_BACK  = 0x4000,
+        FBZ_DRAW_MASK  = 0xc000,
+
+        FBZ_DEPTH_BIAS = (1 << 16),
+
+        FBZ_DEPTH_SOURCE = (1 << 20),
+
+        FBZ_PARAM_ADJUST = (1 << 26)
+};
+/* Texture format codes in the range 0x0-0xe; 0x7 and 0xf are not assigned
+ * here. Presumably the values of the texture format field of textureMode -
+ * TODO confirm, the format decoder is not in this part of the file. */
+enum
+{
+        TEX_RGB332 = 0x0,
+        TEX_Y4I2Q2 = 0x1,
+        TEX_A8 = 0x2,
+        TEX_I8 = 0x3,
+        TEX_AI8 = 0x4,
+        TEX_PAL8 = 0x5,
+        TEX_APAL8 = 0x6,
+        TEX_ARGB8332 = 0x8,
+        TEX_A8Y4I2Q2 = 0x9,
+        TEX_R5G6B5 = 0xa,
+        TEX_ARGB1555 = 0xb,
+        TEX_ARGB4444 = 0xc,
+        TEX_A8I8 = 0xd,
+        TEX_APAL88 = 0xe
+};
+/* Single-bit flags of the textureMode register: the renderer tests
+ * TEXTUREMODE_TRILINEAR against params->textureMode[tmu]. Bit 30 lies outside
+ * the range covered by TEXTUREMODE_MASK (0x3ffff000, bits 12-29). */
+enum
+{
+        TEXTUREMODE_NCC_SEL = (1 << 5),
+        TEXTUREMODE_TCLAMPS = (1 << 6),
+        TEXTUREMODE_TCLAMPT = (1 << 7),
+        TEXTUREMODE_TRILINEAR = (1 << 30)
+};
+/* Flag bits 0 and 1; presumably of the fbiInit0 register (SST_fbiInit0) -
+ * TODO confirm against the register write handler. */
+enum
+{
+        FBIINIT0_VGA_PASS = 1,
+        FBIINIT0_GRAPHICS_RESET = (1 << 1)
+};
+/* Flag bits of fbiInit1: the SLI_ENABLED macro later in this file tests
+ * FBIINIT1_SLI_ENABLE against voodoo->fbiInit1. */
+enum
+{
+        FBIINIT1_MULTI_SST = (1 << 2), /*Voodoo Graphics only*/
+        FBIINIT1_VIDEO_RESET = (1 << 8),
+        FBIINIT1_SLI_ENABLE = (1 << 23)
+};
+/* Mask for the two-bit swap-algorithm field at bits 9-10. */
+enum
+{
+        FBIINIT2_SWAP_ALGORITHM_MASK = (3 << 9)
+};
+/* The four possible values of the field selected by
+ * FBIINIT2_SWAP_ALGORITHM_MASK, already shifted into position. */
+enum
+{
+        FBIINIT2_SWAP_ALGORITHM_DAC_VSYNC      = (0 << 9),
+        FBIINIT2_SWAP_ALGORITHM_DAC_DATA       = (1 << 9),
+        FBIINIT2_SWAP_ALGORITHM_PCI_FIFO_STALL = (2 << 9),
+        FBIINIT2_SWAP_ALGORITHM_SLI_SYNC       = (3 << 9)
+};
+/* Bit 0; presumably of fbiInit3, enabling the SST_remap_* register layout
+ * (the 0x400-based aliases in the register enum) - TODO confirm. */
+enum
+{
+        FBIINIT3_REMAP = 1
+};
+/* Bit 14; presumably of fbiInit5 - TODO confirm. */
+enum
+{
+        FBIINIT5_MULTI_CVG = (1 << 14)
+};
+/* Bit 8; presumably of fbiInit7 - TODO confirm. */
+enum
+{
+        FBIINIT7_CMDFIFO_ENABLE = (1 << 8)
+};
+/* Selector values 0-3 with the CC_LOCALSELECT_ prefix. Presumably matched
+ * against a field of fbzColorPath - TODO confirm; no user is visible in this
+ * part of the file. */
+enum
+{
+        CC_LOCALSELECT_ITER_RGB = 0,
+        CC_LOCALSELECT_TEX = 1,
+        CC_LOCALSELECT_COLOR1 = 2,
+        CC_LOCALSELECT_LFB = 3
+};
+/* Selector values 0-2. Presumably compared with the cca_localselect macro
+ * (a 2-bit field of fbzColorPath) - TODO confirm. */
+enum
+{
+        CCA_LOCALSELECT_ITER_A = 0,
+        CCA_LOCALSELECT_COLOR0 = 1,
+        CCA_LOCALSELECT_ITER_Z = 2
+};
+/* Selector values 0-3. Presumably compared with the _rgb_sel macro (bits 0-1
+ * of fbzColorPath) - TODO confirm. */
+enum
+{
+        C_SEL_ITER_RGB = 0,
+        C_SEL_TEX      = 1,
+        C_SEL_COLOR1   = 2,
+        C_SEL_LFB      = 3
+};
+/* Selector values 0-3. Presumably compared with the a_sel macro (bits 2-3 of
+ * fbzColorPath) - TODO confirm. */
+enum
+{
+        A_SEL_ITER_A = 0,
+        A_SEL_TEX    = 1,
+        A_SEL_COLOR1 = 2,
+        A_SEL_LFB    = 3
+};
+/* Selector values 0-5. Presumably compared with the cc_mselect macro (a 3-bit
+ * field of fbzColorPath) - TODO confirm. */
+enum
+{
+        CC_MSELECT_ZERO   = 0,
+        CC_MSELECT_CLOCAL = 1,
+        CC_MSELECT_AOTHER = 2,
+        CC_MSELECT_ALOCAL = 3,
+        CC_MSELECT_TEX    = 4,
+        CC_MSELECT_TEXRGB = 5
+};
+/* Selector values 0-4. Presumably compared with the cca_mselect macro (a
+ * 3-bit field of fbzColorPath) - TODO confirm. */
+enum
+{
+        CCA_MSELECT_ZERO    = 0,
+        CCA_MSELECT_ALOCAL  = 1,
+        CCA_MSELECT_AOTHER  = 2,
+        CCA_MSELECT_ALOCAL2 = 3,
+        CCA_MSELECT_TEX     = 4
+};
+/* Blend-factor selectors for the texture colour combine: the renderer uses
+ * these as the case labels when it switches on tc_mselect / tc_mselect_1
+ * (3-bit fields of textureMode). Values 6 and 7 are not named. */
+enum
+{
+        TC_MSELECT_ZERO     = 0,
+        TC_MSELECT_CLOCAL   = 1,
+        TC_MSELECT_AOTHER   = 2,
+        TC_MSELECT_ALOCAL   = 3,
+        TC_MSELECT_DETAIL   = 4,
+        TC_MSELECT_LOD_FRAC = 5
+};
+/* Blend-factor selectors for the texture alpha combine: the renderer uses
+ * these as the case labels when it switches on tca_mselect_1. The numbering
+ * is identical to TC_MSELECT_*. */
+enum
+{
+        TCA_MSELECT_ZERO     = 0,
+        TCA_MSELECT_CLOCAL   = 1,
+        TCA_MSELECT_AOTHER   = 2,
+        TCA_MSELECT_ALOCAL   = 3,
+        TCA_MSELECT_DETAIL   = 4,
+        TCA_MSELECT_LOD_FRAC = 5
+};
+/* Values 1 and 2. Presumably compared with the cc_add macro (a 2-bit field of
+ * fbzColorPath) - TODO confirm. */
+enum
+{
+        CC_ADD_CLOCAL = 1,
+        CC_ADD_ALOCAL = 2
+};
+/* Values 1 and 2. Presumably compared with the cca_add macro (a 2-bit field
+ * of fbzColorPath) - TODO confirm. */
+enum
+{
+        CCA_ADD_CLOCAL = 1,
+        CCA_ADD_ALOCAL = 2
+};
+/* Blend factor codes 0x0-0x7 plus 0xf. Presumably the values of the 4-bit
+ * src_afunc / dest_afunc fields that the macros later in this file extract
+ * from params->alphaMode - TODO confirm. */
+enum
+{
+        AFUNC_AZERO = 0x0,
+        AFUNC_ASRC_ALPHA = 0x1,
+        AFUNC_A_COLOR = 0x2,
+        AFUNC_ADST_ALPHA = 0x3,
+        AFUNC_AONE = 0x4,
+        AFUNC_AOMSRC_ALPHA = 0x5,
+        AFUNC_AOM_COLOR = 0x6,
+        AFUNC_AOMDST_ALPHA = 0x7,
+        AFUNC_ASATURATE = 0xf
+};
+/* Shares the numeric value 0xf with AFUNC_ASATURATE above; which of the two
+ * names applies presumably depends on whether the code is used as a source
+ * or a destination factor - TODO confirm. */
+enum
+{
+        AFUNC_ACOLORBEFOREFOG = 0xf
+};
+/* Comparison codes 0-7. Presumably the values of the 3-bit alpha_func field
+ * (alphaMode bits 1-3) - TODO confirm. Not to be confused with the blend
+ * factor codes above, which reuse the AFUNC_ prefix. */
+enum
+{
+        AFUNC_NEVER    = 0,
+        AFUNC_LESSTHAN = 1,
+        AFUNC_EQUAL = 2,
+        AFUNC_LESSTHANEQUAL = 3,
+        AFUNC_GREATERTHAN = 4,
+        AFUNC_NOTEQUAL = 5,
+        AFUNC_GREATERTHANEQUAL = 6,
+        AFUNC_ALWAYS = 7
+};
+/* Comparison codes 0-7, numbered identically to the alpha comparison codes.
+ * Presumably the values of the 3-bit depth_op field (fbzMode bits 5-7) -
+ * TODO confirm. */
+enum
+{
+        DEPTHOP_NEVER    = 0,
+        DEPTHOP_LESSTHAN = 1,
+        DEPTHOP_EQUAL = 2,
+        DEPTHOP_LESSTHANEQUAL = 3,
+        DEPTHOP_GREATERTHAN = 4,
+        DEPTHOP_NOTEQUAL = 5,
+        DEPTHOP_GREATERTHANEQUAL = 6,
+        DEPTHOP_ALWAYS = 7
+};
+/* Fog mode bits. FOG_W (0x18) is not a single bit: it equals
+ * FOG_Z | FOG_ALPHA numerically, so an expression such as (mode & FOG_W) is
+ * also non-zero when only FOG_Z or only FOG_ALPHA is set.
+ * NOTE(review): confirm how the fog code masks and compares this value. */
+enum
+{
+        FOG_ENABLE   = 0x01,
+        FOG_ADD      = 0x02,
+        FOG_MULT     = 0x04,
+        FOG_ALPHA    = 0x08,
+        FOG_Z        = 0x10,
+        FOG_W        = 0x18,
+        FOG_CONSTANT = 0x20
+};
+/* Flag bits of the tLOD register: the texture fetch code tests
+ * LOD_TMIRROR_S and LOD_TMIRROR_T against params->tLOD[tmu]. */
+enum
+{
+        LOD_ODD            = (1 << 18),
+        LOD_SPLIT          = (1 << 19),
+        LOD_S_IS_WIDER     = (1 << 20),
+        LOD_TMULTIBASEADDR = (1 << 24),
+        LOD_TMIRROR_S      = (1 << 28),
+        LOD_TMIRROR_T      = (1 << 29)
+};
+enum    /* command identifiers, numbered 0-3 in declaration order */
+{
+        CMD_INVALID = 0,
+        CMD_DRAWTRIANGLE,
+        CMD_FASTFILL,
+        CMD_SWAPBUF
+};
+/* Bit 27; presumably of fbzColorPath - TODO confirm. */
+enum
+{
+        FBZCP_TEXTURE_ENABLED = (1 << 27)
+};
+/* Bits 14 and 15; presumably of the blitter command register
+ * (SST_bltCommand) - TODO confirm. */
+enum
+{
+        BLTCMD_SRC_TILED = (1 << 14),
+        BLTCMD_DST_TILED = (1 << 15)
+};
+/* Bit 11; presumably of the PCI initEnable register - TODO confirm. */
+enum
+{
+        INITENABLE_SLI_MASTER_SLAVE = (1 << 11)
+};
+/* Flag bits presumably for the sSetupMode register (SST_sSetupMode) - TODO
+ * confirm. Bits 0-7 are named after vertex parameters; bits 16-19 are named
+ * after strip and culling controls. */
+enum
+{
+        SETUPMODE_RGB   = (1 << 0),
+        SETUPMODE_ALPHA = (1 << 1),
+        SETUPMODE_Z     = (1 << 2),
+        SETUPMODE_Wb    = (1 << 3),
+        SETUPMODE_W0    = (1 << 4),
+        SETUPMODE_S0_T0 = (1 << 5),
+        SETUPMODE_W1    = (1 << 6),
+        SETUPMODE_S1_T1 = (1 << 7),
+
+        SETUPMODE_STRIP_MODE = (1 << 16),
+        SETUPMODE_CULLING_ENABLE = (1 << 17),
+        SETUPMODE_CULLING_SIGN = (1 << 18),
+        SETUPMODE_DISABLE_PINGPONG = (1 << 19)
+};
+/* TEXTUREMODE_MASK covers textureMode bits 12-29, i.e. exactly the combine
+ * control bits decoded by the tc_* and tca_* macros below (tc_zero_other at
+ * bit 12 up to tca_invert_output at bit 29). */
+#define TEXTUREMODE_MASK 0x3ffff000
+#define TEXTUREMODE_PASSTHROUGH 0
+/* TEXTUREMODE_LOCAL_MASK selects bits 12, 13, 18, 21 and 22 (tc_zero_other,
+ * tc_sub_clocal, tc_add_clocal, tca_zero_other, tca_sub_clocal).
+ * TEXTUREMODE_LOCAL is the pattern with bits 12, 18 and 21 set, i.e. both
+ * zero_other bits and tc_add_clocal set with both sub_clocal bits clear. */
+#define TEXTUREMODE_LOCAL_MASK 0x00643000
+#define TEXTUREMODE_LOCAL  0x00241000
+
+/* Both macros below need a variable named 'voodoo' in scope. TRIPLE_BUFFER is
+ * true when fbiInit2 bit 4 is set, or when fbiInit5 bits 9-10 hold the
+ * pattern 0x400 (bit 10 set, bit 9 clear). */
+#define SLI_ENABLED (voodoo->fbiInit1 & FBIINIT1_SLI_ENABLE)
+#define TRIPLE_BUFFER ((voodoo->fbiInit2 & 0x10) || (voodoo->fbiInit5 & 0x600) == 0x400)
+
+/* Field extractors; every macro from here on needs a variable named 'params'
+ * in scope. Macros of the form (reg & (1 << n)) yield the masked bit itself
+ * (zero or 1 << n), not 0/1; macros of the form ((reg >> n) & m) yield the
+ * field value. Note that cc_add extracts bits 14-15 of fbzColorPath while
+ * cc_add_alocal tests bit 15 alone, so the two overlap. */
+#define _rgb_sel                 ( params->fbzColorPath & 3)
+#define a_sel                   ( (params->fbzColorPath >> 2) & 3)
+#define cc_localselect          ( params->fbzColorPath & (1 << 4))
+#define cca_localselect         ( (params->fbzColorPath >> 5) & 3)
+#define cc_localselect_override ( params->fbzColorPath & (1 << 7))
+#define cc_zero_other           ( params->fbzColorPath & (1 << 8))
+#define cc_sub_clocal           ( params->fbzColorPath & (1 << 9))
+#define cc_mselect              ( (params->fbzColorPath >> 10) & 7)
+#define cc_reverse_blend        ( params->fbzColorPath & (1 << 13))
+#define cc_add                  ( (params->fbzColorPath >> 14) & 3)
+#define cc_add_alocal           ( params->fbzColorPath & (1 << 15))
+#define cc_invert_output        ( params->fbzColorPath & (1 << 16))
+#define cca_zero_other          ( params->fbzColorPath & (1 << 17))
+#define cca_sub_clocal          ( params->fbzColorPath & (1 << 18))
+#define cca_mselect             ( (params->fbzColorPath >> 19) & 7)
+#define cca_reverse_blend       ( params->fbzColorPath & (1 << 22))
+#define cca_add                 ( (params->fbzColorPath >> 23) & 3)
+#define cca_invert_output       ( params->fbzColorPath & (1 << 25))
+#define tc_zero_other (params->textureMode[0] & (1 << 12))
+#define tc_sub_clocal (params->textureMode[0] & (1 << 13))
+#define tc_mselect    ((params->textureMode[0] >> 14) & 7)
+#define tc_reverse_blend (params->textureMode[0] & (1 << 17))
+#define tc_add_clocal (params->textureMode[0] & (1 << 18))
+#define tc_add_alocal (params->textureMode[0] & (1 << 19))
+#define tc_invert_output (params->textureMode[0] & (1 << 20))
+#define tca_zero_other (params->textureMode[0] & (1 << 21))
+#define tca_sub_clocal (params->textureMode[0] & (1 << 22))
+#define tca_mselect    ((params->textureMode[0] >> 23) & 7)
+#define tca_reverse_blend (params->textureMode[0] & (1 << 26))
+#define tca_add_clocal (params->textureMode[0] & (1 << 27))
+#define tca_add_alocal (params->textureMode[0] & (1 << 28))
+#define tca_invert_output (params->textureMode[0] & (1 << 29))
+/* The _1 variants decode the same bit positions from textureMode[1] instead
+ * of textureMode[0]. There are no _1 variants of zero_other or
+ * invert_output. */
+#define tc_sub_clocal_1 (params->textureMode[1] & (1 << 13))
+#define tc_mselect_1    ((params->textureMode[1] >> 14) & 7)
+#define tc_reverse_blend_1 (params->textureMode[1] & (1 << 17))
+#define tc_add_clocal_1 (params->textureMode[1] & (1 << 18))
+#define tc_add_alocal_1 (params->textureMode[1] & (1 << 19))
+#define tca_sub_clocal_1 (params->textureMode[1] & (1 << 22))
+#define tca_mselect_1    ((params->textureMode[1] >> 23) & 7)
+#define tca_reverse_blend_1 (params->textureMode[1] & (1 << 26))
+#define tca_add_clocal_1 (params->textureMode[1] & (1 << 27))
+#define tca_add_alocal_1 (params->textureMode[1] & (1 << 28))
+/* alphaMode and fbzMode fields. a_ref is whatever remains of alphaMode after
+ * shifting out the low 24 bits. */
+#define src_afunc ( (params->alphaMode >> 8) & 0xf)
+#define dest_afunc ( (params->alphaMode >> 12) & 0xf)
+#define alpha_func ( (params->alphaMode >> 1) & 7)
+#define a_ref ( params->alphaMode >> 24)
+#define depth_op ( (params->fbzMode >> 5) & 7)
+#define dither ( params->fbzMode & FBZ_DITHER)
+#define dither2x2 (params->fbzMode & FBZ_DITHER_2x2)
diff --git a/pcem/vid_voodoo_render.cpp b/pcem/vid_voodoo_render.cpp
new file mode 100644 (file)
index 0000000..2116180
--- /dev/null
@@ -0,0 +1,1640 @@
+#include <math.h>
+#include <stddef.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_dither.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_texture.h"
+/* Scratch state used while rendering. The texture helpers further down this
+ * file read the iterated tmu0_/tmu1_ s, t and w values, write tex_s/tex_t,
+ * lod and lod_frac[], and leave the fetched texel in tex_r/g/b/a[tmu]. How
+ * the remaining fields are used is not visible in this part of the file. */
+typedef struct voodoo_state_t
+{
+        int xstart, xend, xdir;
+        uint32_t base_r, base_g, base_b, base_a, base_z;
+        struct
+        {
+                int64_t base_s, base_t, base_w;
+                int lod;        /* base LOD that voodoo_tmu_fetch() starts from */
+        } tmu[2];
+        int64_t base_w;
+        int lod;                /* working LOD: carries 8 fractional bits while voodoo_tmu_fetch() clamps it, then is shifted right by 8 to an integer level */
+        int lod_min[2], lod_max[2];     /* per-TMU clamp bounds, applied before the shift */
+        int dx1, dx2;
+        int y, yend, ydir;
+        int32_t dxAB, dxAC, dxBC;
+        int tex_b[2], tex_g[2], tex_r[2], tex_a[2];     /* last fetched texel per TMU */
+        int tex_s, tex_t;       /* texture coordinates produced by voodoo_tmu_fetch() */
+        int clamp_s[2], clamp_t[2];     /* non-zero: clamp out-of-range coordinates instead of wrapping */
+
+        int32_t vertexAx, vertexAy, vertexBx, vertexBy, vertexCx, vertexCy;
+
+        uint32_t *tex[2][LOD_MAX+1];    /* texel arrays indexed [tmu][lod]; each texel unpacks as B, G, R, A from the low byte up */
+        int tformat;
+
+        int *tex_w_mask[2];     /* per-TMU tables indexed by integer LOD */
+        int *tex_h_mask[2];
+        int *tex_shift[2];
+        int *tex_lod[2];
+
+        uint16_t *fb_mem, *aux_mem;
+
+        int32_t ib, ig, ir, ia;
+        int32_t z;
+
+        int32_t new_depth;
+
+        int64_t tmu0_s, tmu0_t;
+        int64_t tmu0_w;
+        int64_t tmu1_s, tmu1_t;
+        int64_t tmu1_w;
+        int64_t w;
+
+        int pixel_count, texel_count;
+        int x, x2, x_tiled;
+
+        uint32_t w_depth;
+
+        float log_temp;         /* not referenced by the C helpers below; presumably scratch for the generated-code path - TODO confirm */
+        uint32_t ebp_store;     /* likewise not referenced by the C helpers below - TODO confirm */
+        uint32_t texBaseAddr;
+
+        int lod_frac[2];        /* low 8 bits of the clamped LOD, saved per TMU */
+} voodoo_state_t;
+/* Presumably a debug-trace switch: nearby it is only mentioned in a commented-out pclog() guard - TODO confirm. */
+static int voodoo_output = 0;
+/* Fraction lookup used by fastlog(): indexed by the 8 bits immediately below
+ * the leading one of the input, it supplies the low 8 bits of the result.
+ * Entries rise from 0x00 to 0xff; spot checks suggest
+ * floor(256 * log2(1 + i / 256)) - NOTE(review): not verified for every
+ * entry. */
+static uint8_t logtable[256] =
+{
+        0x00,0x01,0x02,0x04,0x05,0x07,0x08,0x09,0x0b,0x0c,0x0e,0x0f,0x10,0x12,0x13,0x15,
+        0x16,0x17,0x19,0x1a,0x1b,0x1d,0x1e,0x1f,0x21,0x22,0x23,0x25,0x26,0x27,0x28,0x2a,
+        0x2b,0x2c,0x2e,0x2f,0x30,0x31,0x33,0x34,0x35,0x36,0x38,0x39,0x3a,0x3b,0x3d,0x3e,
+        0x3f,0x40,0x41,0x43,0x44,0x45,0x46,0x47,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x50,0x51,
+        0x52,0x53,0x54,0x55,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x60,0x61,0x62,0x63,
+        0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6c,0x6d,0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,
+        0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,0x80,0x81,0x83,0x84,0x85,
+        0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8c,0x8d,0x8e,0x8f,0x90,0x91,0x92,0x93,0x94,
+        0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,0xa0,0xa1,0xa2,0xa2,0xa3,
+        0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xad,0xae,0xaf,0xb0,0xb1,0xb2,
+        0xb3,0xb4,0xb5,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbc,0xbd,0xbe,0xbf,0xc0,
+        0xc1,0xc2,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xcd,
+        0xce,0xcf,0xd0,0xd1,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd6,0xd7,0xd8,0xd9,0xda,0xda,
+        0xdb,0xdc,0xdd,0xde,0xde,0xdf,0xe0,0xe1,0xe1,0xe2,0xe3,0xe4,0xe5,0xe5,0xe6,0xe7,
+        0xe8,0xe8,0xe9,0xea,0xeb,0xeb,0xec,0xed,0xee,0xef,0xef,0xf0,0xf1,0xf2,0xf2,0xf3,
+        0xf4,0xf5,0xf5,0xf6,0xf7,0xf7,0xf8,0xf9,0xfa,0xfa,0xfb,0xfc,0xfd,0xfd,0xfe,0xff
+};
+/* Approximate log2(val) as a fixed-point number with 8 fractional bits.
+ *
+ * The integer part is the index of the highest set bit of val; the fraction
+ * comes from logtable[], indexed by the 8 bits immediately below that bit.
+ * Returns 0x80000000 when val is zero or has bit 63 set.
+ * NOTE(review): 0x80000000 does not fit in a signed 32-bit int, so what the
+ * caller receives depends on the implementation's unsigned-to-int conversion
+ * (INT_MIN on the usual two's-complement targets). */
+static inline int fastlog(uint64_t val)
+{
+        uint64_t oldval = val;  /* untouched copy; the fraction bits are taken from it */
+        int exp = 63;
+        int frac;
+        /* Reject inputs for which no result is defined here. */
+        if (!val || val & (1ULL << 63))
+                return 0x80000000;
+        /* Binary search for the leading one: whenever the upper part of the
+         * remaining window is empty, lower exp by that width and shift val
+         * up. After the six steps exp is the index of the highest set bit. */
+        if (!(val & 0xffffffff00000000))
+        {
+                exp -= 32;
+                val <<= 32;
+        }
+        if (!(val & 0xffff000000000000))
+        {
+                exp -= 16;
+                val <<= 16;
+        }
+        if (!(val & 0xff00000000000000))
+        {
+                exp -= 8;
+                val <<= 8;
+        }
+        if (!(val & 0xf000000000000000))
+        {
+                exp -= 4;
+                val <<= 4;
+        }
+        if (!(val & 0xc000000000000000))
+        {
+                exp -= 2;
+                val <<= 2;
+        }
+        if (!(val & 0x8000000000000000))
+        {
+                exp -= 1;
+                val <<= 1;
+        }
+        /* Align the leading one of the original value with bit 8 and keep
+         * the 8 bits below it as the table index. */
+        if (exp >= 8)
+                frac = (oldval >> (exp - 8)) & 0xff;
+        else
+                frac = (oldval << (8 - exp)) & 0xff;
+
+        return (exp << 8) | logtable[frac];
+}
+/* Count the leading zero bits of a 16-bit value by binary search.
+ *
+ * Returns 0 when bit 15 is set and 15 when only bit 0 is set. val == 0 also
+ * returns 15, because each of the four tests then adds its width
+ * (8 + 4 + 2 + 1).
+ * NOTE(review): despite the name, the result is a leading-zero count, not the
+ * index of the last set bit. */
+static inline int voodoo_fls(uint16_t val)
+{
+        int num = 0;
+
+//pclog("fls(%04x) = ", val);
+        if (!(val & 0xff00))    /* top 8 bits clear */
+        {
+                num += 8;
+                val <<= 8;
+        }
+        if (!(val & 0xf000))    /* next 4 bits clear */
+        {
+                num += 4;
+                val <<= 4;
+        }
+        if (!(val & 0xc000))    /* next 2 bits clear */
+        {
+                num += 2;
+                val <<= 2;
+        }
+        if (!(val & 0x8000))    /* remaining top bit clear */
+        {
+                num += 1;
+                val <<= 1;
+        }
+//pclog("%i %04x\n", num, val);
+        return num;
+}
+/* Per-fetch texture addressing parameters handed to tex_read() and
+ * tex_read_4(); filled in by voodoo_get_texture(). */
+typedef struct voodoo_texture_state_t
+{
+        int s, t;               /* integer texel coordinates; tex_read() clamps or wraps them in place */
+        int w_mask, h_mask;     /* coordinate bits that are in range; also the largest valid coordinate when clamping */
+        int tex_shift;          /* a row index is shifted left by this to form the texel offset */
+} voodoo_texture_state_t;
+/* Point-sample one texel for the given TMU.
+ *
+ * texture_state->s and ->t are integer texel coordinates. A coordinate with
+ * bits outside w_mask / h_mask is either clamped to [0, mask] (when
+ * state->clamp_s[tmu] / clamp_t[tmu] is non-zero) or wrapped by ANDing with
+ * the mask; the adjusted value is written back to texture_state. The 32-bit
+ * texel at tex[tmu][state->lod][s + (t << tex_shift)] is then split into
+ * state->tex_b/g/r/a[tmu], taking bytes from the low end upwards. */
+static inline void tex_read(voodoo_state_t *state, voodoo_texture_state_t *texture_state, int tmu)
+{
+        uint32_t dat;
+
+        if (texture_state->s & ~texture_state->w_mask)  /* s is out of range */
+        {
+                if (state->clamp_s[tmu])
+                {
+                        if (texture_state->s < 0)
+                                texture_state->s = 0;
+                        if (texture_state->s > texture_state->w_mask)
+                                texture_state->s = texture_state->w_mask;
+                }
+                else
+                        texture_state->s &= texture_state->w_mask;
+        }
+        if (texture_state->t & ~texture_state->h_mask)  /* t is out of range */
+        {
+                if (state->clamp_t[tmu])
+                {
+                        if (texture_state->t < 0)
+                                texture_state->t = 0;
+                        if (texture_state->t > texture_state->h_mask)
+                                texture_state->t = texture_state->h_mask;
+                }
+                else
+                        texture_state->t &= texture_state->h_mask;
+        }
+
+        dat = state->tex[tmu][state->lod][texture_state->s + (texture_state->t << texture_state->tex_shift)];
+
+        state->tex_b[tmu] = dat & 0xff;
+        state->tex_g[tmu] = (dat >> 8) & 0xff;
+        state->tex_r[tmu] = (dat >> 16) & 0xff;
+        state->tex_a[tmu] = (dat >> 24) & 0xff;
+}
+/* Replicate one nibble of x into both halves of a byte: LOW4 uses bits 0-3,
+ * HIGH4 uses bits 4-7. NOTE(review): x is not parenthesised in the expansion,
+ * so an argument containing an operator of lower precedence than '&' is
+ * grouped differently from what the call site suggests. */
+#define LOW4(x)  ((x & 0x0f) | ((x & 0x0f) << 4))
+#define HIGH4(x) ((x & 0xf0) | ((x & 0xf0) >> 4))
+/* Fetch the 2x2 texel block whose top-left texel is (s, t) and blend it.
+ *
+ * d[0..3] are the weights for (s, t), (s+1, t), (s, t+1) and (s+1, t+1); each
+ * channel is the weighted sum shifted right by 8, so the weights are expected
+ * to total 256 (the caller in this file builds them that way). If any of the
+ * four taps has bits outside w_mask / h_mask, each tap is clamped or wrapped
+ * individually using the same rules as tex_read(); otherwise the texels are
+ * indexed directly. The result is stored in state->tex_r/g/b/a[tmu].
+ * The x parameter is not used. */
+static inline void tex_read_4(voodoo_state_t *state, voodoo_texture_state_t *texture_state, int s, int t, int *d, int tmu, int x)
+{
+        rgba_u dat[4];
+
+        if (((s | (s + 1)) & ~texture_state->w_mask) || ((t | (t + 1)) & ~texture_state->h_mask))
+        {
+                int c;
+                for (c = 0; c < 4; c++)
+                {
+                        int _s = s + (c & 1);           /* bit 0 of c selects the column */
+                        int _t = t + ((c & 2) >> 1);    /* bit 1 of c selects the row */
+
+                        if (_s & ~texture_state->w_mask)
+                        {
+                                if (state->clamp_s[tmu])
+                                {
+                                        if (_s < 0)
+                                                _s = 0;
+                                        if (_s > texture_state->w_mask)
+                                                _s = texture_state->w_mask;
+                                }
+                                else
+                                        _s &= texture_state->w_mask;
+                        }
+                        if (_t & ~texture_state->h_mask)
+                        {
+                                if (state->clamp_t[tmu])
+                                {
+                                        if (_t < 0)
+                                                _t = 0;
+                                        if (_t > texture_state->h_mask)
+                                                _t = texture_state->h_mask;
+                                }
+                                else
+                                        _t &= texture_state->h_mask;
+                        }
+                        dat[c].u = state->tex[tmu][state->lod][_s + (_t << texture_state->tex_shift)];
+                }
+        }
+        else
+        {
+                /* All four taps are in range: index them directly. */
+                dat[0].u = state->tex[tmu][state->lod][s +     (t << texture_state->tex_shift)];
+                dat[1].u = state->tex[tmu][state->lod][s + 1 + (t << texture_state->tex_shift)];
+                dat[2].u = state->tex[tmu][state->lod][s +     ((t + 1) << texture_state->tex_shift)];
+                dat[3].u = state->tex[tmu][state->lod][s + 1 + ((t + 1) << texture_state->tex_shift)];
+        }
+
+        state->tex_r[tmu] = (dat[0].rgba.r * d[0] + dat[1].rgba.r * d[1] + dat[2].rgba.r * d[2] + dat[3].rgba.r * d[3]) >> 8;
+        state->tex_g[tmu] = (dat[0].rgba.g * d[0] + dat[1].rgba.g * d[1] + dat[2].rgba.g * d[2] + dat[3].rgba.g * d[3]) >> 8;
+        state->tex_b[tmu] = (dat[0].rgba.b * d[0] + dat[1].rgba.b * d[1] + dat[2].rgba.b * d[2] + dat[3].rgba.b * d[3]) >> 8;
+        state->tex_a[tmu] = (dat[0].rgba.a * d[0] + dat[1].rgba.a * d[1] + dat[2].rgba.a * d[2] + dat[3].rgba.a * d[3]) >> 8;
+}
+/* Sample the texture of one TMU at state->tex_s / state->tex_t for the
+ * integer LOD in state->lod, leaving the texel in state->tex_r/g/b/a[tmu].
+ *
+ * Uses a weighted 2x2 fetch (tex_read_4) when voodoo->bilinear_enabled is set
+ * and either of textureMode bits 1-2 is set, and a single-texel fetch
+ * (tex_read) otherwise. state->tex_s and state->tex_t are modified in place
+ * by the mirroring step and by the half-texel offset of the 2x2 path. */
+static inline void voodoo_get_texture(voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int tmu, int x)
+{
+        voodoo_texture_state_t texture_state;
+        int d[4];
+        int s, t;
+        int tex_lod = state->tex_lod[tmu][state->lod];
+
+        texture_state.w_mask = state->tex_w_mask[tmu][state->lod];
+        texture_state.h_mask = state->tex_h_mask[tmu][state->lod];
+        texture_state.tex_shift = 8 - tex_lod;
+        /* Mirroring: when enabled in tLOD and bit 12 of the coordinate is
+         * set, replace the coordinate with its bitwise complement. */
+        if (params->tLOD[tmu] & LOD_TMIRROR_S)
+        {
+                if (state->tex_s & 0x1000)
+                        state->tex_s = ~state->tex_s;
+        }
+        if (params->tLOD[tmu] & LOD_TMIRROR_T)
+        {
+                if (state->tex_t & 0x1000)
+                        state->tex_t = ~state->tex_t;
+        }
+
+        if (voodoo->bilinear_enabled && params->textureMode[tmu] & 6)
+        {
+                int _ds, dt;
+                /* Step back by half a texel at this LOD (a whole texel is
+                 * 1 << (4+tex_lod), as the point-sample path shows). */
+                state->tex_s -= 1 << (3+tex_lod);
+                state->tex_t -= 1 << (3+tex_lod);
+
+                s = state->tex_s >> tex_lod;
+                t = state->tex_t >> tex_lod;
+                /* Low 4 bits are the position inside the texel, the rest is
+                 * the integer texel coordinate. */
+                _ds = s & 0xf;
+                dt = t & 0xf;
+
+                s >>= 4;
+                t >>= 4;
+//if (x == 80)
+//if (voodoo_output)
+//        pclog("s=%08x t=%08x _ds=%02x _dt=%02x\n", s, t, _ds, dt);
+                d[0] = (16 - _ds) * (16 - dt);  /* the four weights always total 16 * 16 = 256 */
+                d[1] =  _ds * (16 - dt);
+                d[2] = (16 - _ds) * dt;
+                d[3] = _ds * dt;
+
+//                texture_state.s = s;
+//                texture_state.t = t;
+                tex_read_4(state, &texture_state, s, t, d, tmu, x);
+
+
+/*                state->tex_r = (tex_samples[0].rgba.r * d[0] + tex_samples[1].rgba.r * d[1] + tex_samples[2].rgba.r * d[2] + tex_samples[3].rgba.r * d[3]) >> 8;
+                state->tex_g = (tex_samples[0].rgba.g * d[0] + tex_samples[1].rgba.g * d[1] + tex_samples[2].rgba.g * d[2] + tex_samples[3].rgba.g * d[3]) >> 8;
+                state->tex_b = (tex_samples[0].rgba.b * d[0] + tex_samples[1].rgba.b * d[1] + tex_samples[2].rgba.b * d[2] + tex_samples[3].rgba.b * d[3]) >> 8;
+                state->tex_a = (tex_samples[0].rgba.a * d[0] + tex_samples[1].rgba.a * d[1] + tex_samples[2].rgba.a * d[2] + tex_samples[3].rgba.a * d[3]) >> 8;*/
+/*                state->tex_r = tex_samples[0].r;
+                state->tex_g = tex_samples[0].g;
+                state->tex_b = tex_samples[0].b;
+                state->tex_a = tex_samples[0].a;*/
+        }
+        else
+        {
+        //        rgba_t tex_samples;
+        //        voodoo_texture_state_t texture_state;
+//                int s = state->tex_s >> (18+state->lod);
+//                int t = state->tex_t >> (18+state->lod);
+        //        int s, t;
+
+//                state->tex_s -= 1 << (17+state->lod);
+//                state->tex_t -= 1 << (17+state->lod);
+
+                s = state->tex_s >> (4+tex_lod);        /* drop the 4 fractional bits and scale to this LOD */
+                t = state->tex_t >> (4+tex_lod);
+
+                texture_state.s = s;
+                texture_state.t = t;
+                tex_read(state, &texture_state, tmu);
+
+/*                state->tex_r = tex_samples[0].rgba.r;
+                state->tex_g = tex_samples[0].rgba.g;
+                state->tex_b = tex_samples[0].rgba.b;
+                state->tex_a = tex_samples[0].rgba.a;*/
+        }
+}
+/* Compute texture coordinates and LOD for one TMU, then fetch the texel.
+ *
+ * With textureMode[tmu] bit 0 set, s and t are divided by w: _w is
+ * 2^48 / tmuN_w (left at 0 when tmuN_w is 0), each of s and t is rounded
+ * down by 14 bits, multiplied by _w and rounded down by another 30 bits, and
+ * fastlog(_w) - (19 << 8) is added to the TMU's base LOD. With the bit clear,
+ * s and t are simply shifted right by 28 and the base LOD is used as is.
+ * The LOD is then clamped to [lod_min, lod_max]; its low 8 bits are saved in
+ * lod_frac[tmu] and the remainder becomes the integer state->lod passed on
+ * to voodoo_get_texture().
+ *
+ * NOTE(review): 1ULL makes the division unsigned, so a negative tmuN_w is
+ * converted to a large unsigned value first - confirm this is intended.
+ * NOTE(review): when _w is 0, fastlog() returns its 0x80000000 sentinel and
+ * the LOD sum is computed from that value - confirm the result is what the
+ * clamp below expects. */
+static inline void voodoo_tmu_fetch(voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int tmu, int x)
+{
+        if (params->textureMode[tmu] & 1)
+        {
+                int64_t _w = 0;
+
+                if (tmu)
+                {
+                        if (state->tmu1_w)
+                                _w = (int64_t)((1ULL << 48) / state->tmu1_w);
+                        state->tex_s = (int32_t)(((((state->tmu1_s + (1 << 13)) >> 14) * _w) + (1 << 29))  >> 30);
+                        state->tex_t = (int32_t)(((((state->tmu1_t + (1 << 13))  >> 14)  * _w) + (1 << 29))  >> 30);
+                }
+                else
+                {
+                        if (state->tmu0_w)
+                                _w = (int64_t)((1ULL << 48) / state->tmu0_w);
+                        state->tex_s = (int32_t)(((((state->tmu0_s + (1 << 13))  >> 14) * _w) + (1 << 29)) >> 30);
+                        state->tex_t = (int32_t)(((((state->tmu0_t + (1 << 13))  >> 14)  * _w) + (1 << 29))  >> 30);
+                }
+
+                state->lod = state->tmu[tmu].lod + (fastlog(_w) - (19 << 8));
+        }
+        else
+        {
+                if (tmu)
+                {
+                        state->tex_s = (int32_t)(state->tmu1_s >> (14+14));
+                        state->tex_t = (int32_t)(state->tmu1_t >> (14+14));
+                }
+                else
+                {
+                        state->tex_s = (int32_t)(state->tmu0_s >> (14+14));
+                        state->tex_t = (int32_t)(state->tmu0_t >> (14+14));
+                }
+                state->lod = state->tmu[tmu].lod;
+        }
+        /* Clamp while the LOD still carries its 8 fractional bits, then
+         * split it into fraction and integer level. */
+        if (state->lod < state->lod_min[tmu])
+                state->lod = state->lod_min[tmu];
+        else if (state->lod > state->lod_max[tmu])
+                state->lod = state->lod_max[tmu];
+        state->lod_frac[tmu] = state->lod & 0xff;
+        state->lod >>= 8;
+
+        voodoo_get_texture(voodoo, params, state, tmu, x);
+}
+
+
+/*Perform texture fetch and blending for both TMUs*/
+static inline void voodoo_tmu_fetch_and_blend(voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int x)
+{
+        int r,g,b,a;
+        int c_reverse, a_reverse;
+//        int c_reverse1, a_reverse1;
+        int factor_r = 0, factor_g = 0, factor_b = 0, factor_a = 0;
+
+        voodoo_tmu_fetch(voodoo, params, state, 1, x);
+
+        if ((params->textureMode[1] & TEXTUREMODE_TRILINEAR) && (state->lod & 1))
+        {
+                c_reverse = tc_reverse_blend;
+                a_reverse = tca_reverse_blend;
+        }
+        else
+        {
+                c_reverse = !tc_reverse_blend;
+                a_reverse = !tca_reverse_blend;
+        }
+/*        c_reverse1 = c_reverse;
+        a_reverse1 = a_reverse;*/
+        if (tc_sub_clocal_1)
+        {
+                switch (tc_mselect_1)
+                {
+                        case TC_MSELECT_ZERO:
+                        factor_r = factor_g = factor_b = 0;
+                        break;
+                        case TC_MSELECT_CLOCAL:
+                        factor_r = state->tex_r[1];
+                        factor_g = state->tex_g[1];
+                        factor_b = state->tex_b[1];
+                        break;
+                        case TC_MSELECT_AOTHER:
+                        factor_r = factor_g = factor_b = 0;
+                        break;
+                        case TC_MSELECT_ALOCAL:
+                        factor_r = factor_g = factor_b = state->tex_a[1];
+                        break;
+                        case TC_MSELECT_DETAIL:
+                        factor_r = (params->detail_bias[1] - state->lod) << params->detail_scale[1];
+                        if (factor_r > params->detail_max[1])
+                                factor_r = params->detail_max[1];
+                        factor_g = factor_b = factor_r;
+                        break;
+                        case TC_MSELECT_LOD_FRAC:
+                        factor_r = factor_g = factor_b = state->lod_frac[1];
+                        break;
+                }
+                if (!c_reverse)
+                {
+                        r = (-state->tex_r[1] * (factor_r + 1)) >> 8;
+                        g = (-state->tex_g[1] * (factor_g + 1)) >> 8;
+                        b = (-state->tex_b[1] * (factor_b + 1)) >> 8;
+                }
+                else
+                {
+                        r = (-state->tex_r[1] * ((factor_r^0xff) + 1)) >> 8;
+                        g = (-state->tex_g[1] * ((factor_g^0xff) + 1)) >> 8;
+                        b = (-state->tex_b[1] * ((factor_b^0xff) + 1)) >> 8;
+                }
+                if (tc_add_clocal_1)
+                {
+                        r += state->tex_r[1];
+                        g += state->tex_g[1];
+                        b += state->tex_b[1];
+                }
+                else if (tc_add_alocal_1)
+                {
+                        r += state->tex_a[1];
+                        g += state->tex_a[1];
+                        b += state->tex_a[1];
+                }
+                state->tex_r[1] = CLAMP(r);
+                state->tex_g[1] = CLAMP(g);
+                state->tex_b[1] = CLAMP(b);
+        }
+        if (tca_sub_clocal_1)
+        {
+                switch (tca_mselect_1)
+                {
+                        case TCA_MSELECT_ZERO:
+                        factor_a = 0;
+                        break;
+                        case TCA_MSELECT_CLOCAL:
+                        factor_a = state->tex_a[1];
+                        break;
+                        case TCA_MSELECT_AOTHER:
+                        factor_a = 0;
+                        break;
+                        case TCA_MSELECT_ALOCAL:
+                        factor_a = state->tex_a[1];
+                        break;
+                        case TCA_MSELECT_DETAIL:
+                        factor_a = (params->detail_bias[1] - state->lod) << params->detail_scale[1];
+                        if (factor_a > params->detail_max[1])
+                                factor_a = params->detail_max[1];
+                        break;
+                        case TCA_MSELECT_LOD_FRAC:
+                        factor_a = state->lod_frac[1];
+                        break;
+                }
+                if (!a_reverse)
+                        a = (-state->tex_a[1] * ((factor_a ^ 0xff) + 1)) >> 8;
+                else
+                        a = (-state->tex_a[1] * (factor_a + 1)) >> 8;
+                if (tca_add_clocal_1 || tca_add_alocal_1)
+                        a += state->tex_a[1];
+                state->tex_a[1] = CLAMP(a);
+        }
+
+
+        voodoo_tmu_fetch(voodoo, params, state, 0, x);
+
+        if ((params->textureMode[0] & TEXTUREMODE_TRILINEAR) && (state->lod & 1))
+        {
+                c_reverse = tc_reverse_blend;
+                a_reverse = tca_reverse_blend;
+        }
+        else
+        {
+                c_reverse = !tc_reverse_blend;
+                a_reverse = !tca_reverse_blend;
+        }
+
+        if (!tc_zero_other)
+        {
+                r = state->tex_r[1];
+                g = state->tex_g[1];
+                b = state->tex_b[1];
+        }
+        else
+                r = g = b = 0;
+        if (tc_sub_clocal)
+        {
+                r -= state->tex_r[0];
+                g -= state->tex_g[0];
+                b -= state->tex_b[0];
+        }
+        switch (tc_mselect)
+        {
+                case TC_MSELECT_ZERO:
+                factor_r = factor_g = factor_b = 0;
+                break;
+                case TC_MSELECT_CLOCAL:
+                factor_r = state->tex_r[0];
+                factor_g = state->tex_g[0];
+                factor_b = state->tex_b[0];
+                break;
+                case TC_MSELECT_AOTHER:
+                factor_r = factor_g = factor_b = state->tex_a[1];
+                break;
+                case TC_MSELECT_ALOCAL:
+                factor_r = factor_g = factor_b = state->tex_a[0];
+                break;
+                case TC_MSELECT_DETAIL:
+                factor_r = (params->detail_bias[0] - state->lod) << params->detail_scale[0];
+                if (factor_r > params->detail_max[0])
+                        factor_r = params->detail_max[0];
+                factor_g = factor_b = factor_r;
+                break;
+                case TC_MSELECT_LOD_FRAC:
+                factor_r = factor_g = factor_b = state->lod_frac[0];
+                break;
+        }
+        if (!c_reverse)
+        {
+                r = (r * (factor_r + 1)) >> 8;
+                g = (g * (factor_g + 1)) >> 8;
+                b = (b * (factor_b + 1)) >> 8;
+        }
+        else
+        {
+                r = (r * ((factor_r^0xff) + 1)) >> 8;
+                g = (g * ((factor_g^0xff) + 1)) >> 8;
+                b = (b * ((factor_b^0xff) + 1)) >> 8;
+        }
+        if (tc_add_clocal)
+        {
+                r += state->tex_r[0];
+                g += state->tex_g[0];
+                b += state->tex_b[0];
+        }
+        else if (tc_add_alocal)
+        {
+                r += state->tex_a[0];
+                g += state->tex_a[0];
+                b += state->tex_a[0];
+        }
+
+        if (!tca_zero_other)
+                a = state->tex_a[1];
+        else
+                a = 0;
+        if (tca_sub_clocal)
+                a -= state->tex_a[0];
+        switch (tca_mselect)
+        {
+                case TCA_MSELECT_ZERO:
+                factor_a = 0;
+                break;
+                case TCA_MSELECT_CLOCAL:
+                factor_a = state->tex_a[0];
+                break;
+                case TCA_MSELECT_AOTHER:
+                factor_a = state->tex_a[1];
+                break;
+                case TCA_MSELECT_ALOCAL:
+                factor_a = state->tex_a[0];
+                break;
+                case TCA_MSELECT_DETAIL:
+                factor_a = (params->detail_bias[0] - state->lod) << params->detail_scale[0];
+                if (factor_a > params->detail_max[0])
+                        factor_a = params->detail_max[0];
+                break;
+                case TCA_MSELECT_LOD_FRAC:
+                factor_a = state->lod_frac[0];
+                break;
+        }
+        if (a_reverse)
+                a = (a * ((factor_a ^ 0xff) + 1)) >> 8;
+        else
+                a = (a * (factor_a + 1)) >> 8;
+        if (tca_add_clocal || tca_add_alocal)
+                a += state->tex_a[0];
+
+
+        state->tex_r[0] = CLAMP(r);
+        state->tex_g[0] = CLAMP(g);
+        state->tex_b[0] = CLAMP(b);
+        state->tex_a[0] = CLAMP(a);
+
+        if (tc_invert_output)
+        {
+                state->tex_r[0] ^= 0xff;
+                state->tex_g[0] ^= 0xff;
+                state->tex_b[0] ^= 0xff;
+        }
+        if (tca_invert_output)
+                state->tex_a[0] ^= 0xff;
+}
+
+/* Select the pixel-pipeline code generator header from the compiler's
+ * predefined macros:
+ *   - any of i386 / __i386 / __i386__ / _X86_ / WIN32 / _WIN32 defined and
+ *     __amd64__ NOT defined -> vid_voodoo_codegen_x86.h
+ *   - __amd64__ defined     -> vid_voodoo_codegen_x86-64.h
+ *   - otherwise             -> no codegen header; only voodoo_recomp is
+ *                              defined here, initialised to 0 (presumably the
+ *                              codegen headers provide it otherwise - confirm).
+ *
+ * NOTE(review): "defined _WIN32" is listed twice in the first condition; the
+ * second occurrence is redundant (possibly a different macro was intended).
+ * NOTE(review): the 64-bit check tests only __amd64__. A toolchain that defines
+ * _WIN32 for 64-bit targets without defining __amd64__ would fall into the
+ * 32-bit branch - confirm against the compilers this file is built with. */
+#if (defined i386 || defined __i386 || defined __i386__ || defined _X86_ || defined WIN32 || defined _WIN32 || defined _WIN32) && !(defined __amd64__)
+#include "vid_voodoo_codegen_x86.h"
+#elif (defined __amd64__)
+#include "vid_voodoo_codegen_x86-64.h"
+#else
+int voodoo_recomp = 0;
+#endif
+
+static void voodoo_half_triangle(voodoo_t *voodoo, voodoo_params_t *params, voodoo_state_t *state, int ystart, int yend, int odd_even)
+{
+/*        int rgb_sel                 = params->fbzColorPath & 3;
+        int a_sel                   = (params->fbzColorPath >> 2) & 3;
+        int cc_localselect          = params->fbzColorPath & (1 << 4);
+        int cca_localselect         = (params->fbzColorPath >> 5) & 3;
+        int cc_localselect_override = params->fbzColorPath & (1 << 7);
+        int cc_zero_other           = params->fbzColorPath & (1 << 8);
+        int cc_sub_clocal           = params->fbzColorPath & (1 << 9);
+        int cc_mselect              = (params->fbzColorPath >> 10) & 7;
+        int cc_reverse_blend        = params->fbzColorPath & (1 << 13);
+        int cc_add                  = (params->fbzColorPath >> 14) & 3;
+        int cc_add_alocal           = params->fbzColorPath & (1 << 15);
+        int cc_invert_output        = params->fbzColorPath & (1 << 16);
+        int cca_zero_other          = params->fbzColorPath & (1 << 17);
+        int cca_sub_clocal          = params->fbzColorPath & (1 << 18);
+        int cca_mselect             = (params->fbzColorPath >> 19) & 7;
+        int cca_reverse_blend       = params->fbzColorPath & (1 << 22);
+        int cca_add                 = (params->fbzColorPath >> 23) & 3;
+        int cca_invert_output       = params->fbzColorPath & (1 << 25);
+        int src_afunc = (params->alphaMode >> 8) & 0xf;
+        int dest_afunc = (params->alphaMode >> 12) & 0xf;
+        int alpha_func = (params->alphaMode >> 1) & 7;
+        int a_ref = params->alphaMode >> 24;
+        int depth_op = (params->fbzMode >> 5) & 7;
+        int dither = params->fbzMode & FBZ_DITHER;*/
+        int texels;
+        int c;
+#ifndef NO_CODEGEN
+        uint8_t (*voodoo_draw)(voodoo_state_t *state, voodoo_params_t *params, int x, int real_y);
+#endif
+        int y_diff = SLI_ENABLED ? 2 : 1;
+
+        if ((params->textureMode[0] & TEXTUREMODE_MASK) == TEXTUREMODE_PASSTHROUGH ||
+            (params->textureMode[0] & TEXTUREMODE_LOCAL_MASK) == TEXTUREMODE_LOCAL)
+                texels = 1;
+        else
+                texels = 2;
+
+        state->clamp_s[0] = params->textureMode[0] & TEXTUREMODE_TCLAMPS;
+        state->clamp_t[0] = params->textureMode[0] & TEXTUREMODE_TCLAMPT;
+        state->clamp_s[1] = params->textureMode[1] & TEXTUREMODE_TCLAMPS;
+        state->clamp_t[1] = params->textureMode[1] & TEXTUREMODE_TCLAMPT;
+//        int last_x;
+//        pclog("voodoo_triangle : bottom-half %X %X %X %X %X %i  %i %i %i\n", xstart, xend, dx1, dx2, dx2 * 36, xdir,  y, yend, ydir);
+
+        for (c = 0; c <= LOD_MAX; c++)
+        {
+                state->tex[0][c] = &voodoo->texture_cache[0][params->tex_entry[0]].data[texture_offset[c]];
+                state->tex[1][c] = &voodoo->texture_cache[1][params->tex_entry[1]].data[texture_offset[c]];
+        }
+
+        state->tformat = params->tformat[0];
+
+        state->tex_w_mask[0] = params->tex_w_mask[0];
+        state->tex_h_mask[0] = params->tex_h_mask[0];
+        state->tex_shift[0] = params->tex_shift[0];
+        state->tex_lod[0] = params->tex_lod[0];
+        state->tex_w_mask[1] = params->tex_w_mask[1];
+        state->tex_h_mask[1] = params->tex_h_mask[1];
+        state->tex_shift[1] = params->tex_shift[1];
+        state->tex_lod[1] = params->tex_lod[1];
+
+        if ((params->fbzMode & 1) && (ystart < params->clipLowY))
+        {
+                int dy = params->clipLowY - ystart;
+
+                state->base_r += params->dRdY*dy;
+                state->base_g += params->dGdY*dy;
+                state->base_b += params->dBdY*dy;
+                state->base_a += params->dAdY*dy;
+                state->base_z += params->dZdY*dy;
+                state->tmu[0].base_s += params->tmu[0].dSdY*dy;
+                state->tmu[0].base_t += params->tmu[0].dTdY*dy;
+                state->tmu[0].base_w += params->tmu[0].dWdY*dy;
+                state->tmu[1].base_s += params->tmu[1].dSdY*dy;
+                state->tmu[1].base_t += params->tmu[1].dTdY*dy;
+                state->tmu[1].base_w += params->tmu[1].dWdY*dy;
+                state->base_w += params->dWdY*dy;
+                state->xstart += state->dx1*dy;
+                state->xend   += state->dx2*dy;
+
+                ystart = params->clipLowY;
+        }
+
+        if ((params->fbzMode & 1) && (yend >= params->clipHighY))
+                yend = params->clipHighY;
+
+        state->y = ystart;
+//        yend--;
+
+        if (SLI_ENABLED)
+        {
+                int test_y;
+
+                if (params->fbzMode & (1 << 17))
+                        test_y = (voodoo->v_disp-1) - state->y;
+                else
+                        test_y = state->y;
+
+                if ((!(voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && (test_y & 1)) ||
+                    ((voodoo->initEnable & INITENABLE_SLI_MASTER_SLAVE) && !(test_y & 1)))
+                {
+                        state->y++;
+
+                        state->base_r += params->dRdY;
+                        state->base_g += params->dGdY;
+                        state->base_b += params->dBdY;
+                        state->base_a += params->dAdY;
+                        state->base_z += params->dZdY;
+                        state->tmu[0].base_s += params->tmu[0].dSdY;
+                        state->tmu[0].base_t += params->tmu[0].dTdY;
+                        state->tmu[0].base_w += params->tmu[0].dWdY;
+                        state->tmu[1].base_s += params->tmu[1].dSdY;
+                        state->tmu[1].base_t += params->tmu[1].dTdY;
+                        state->tmu[1].base_w += params->tmu[1].dWdY;
+                        state->base_w += params->dWdY;
+                        state->xstart += state->dx1;
+                        state->xend += state->dx2;
+                }
+        }
+#ifndef NO_CODEGEN
+        if (voodoo->use_recompiler)
+                voodoo_draw = voodoo_get_block(voodoo, params, state, odd_even);
+        else
+                voodoo_draw = NULL;
+#endif
+
+        if (voodoo_output)
+                pclog("dxAB=%08x dxBC=%08x dxAC=%08x\n", state->dxAB, state->dxBC, state->dxAC);
+//        pclog("Start %i %i\n", ystart, voodoo->fbzMode & (1 << 17));
+
+        for (; state->y < yend; state->y += y_diff)
+        {
+                int x, x2;
+                int real_y = (state->y << 4) + 8;
+                int start_x, start_x2;
+                int dx;
+                uint16_t *fb_mem, *aux_mem;
+
+                state->ir = state->base_r;
+                state->ig = state->base_g;
+                state->ib = state->base_b;
+                state->ia = state->base_a;
+                state->z = state->base_z;
+                state->tmu0_s = state->tmu[0].base_s;
+                state->tmu0_t = state->tmu[0].base_t;
+                state->tmu0_w = state->tmu[0].base_w;
+                state->tmu1_s = state->tmu[1].base_s;
+                state->tmu1_t = state->tmu[1].base_t;
+                state->tmu1_w = state->tmu[1].base_w;
+                state->w = state->base_w;
+
+                x = (state->vertexAx << 12) + ((state->dxAC * (real_y - state->vertexAy)) >> 4);
+
+                if (real_y < state->vertexBy)
+                        x2 = (state->vertexAx << 12) + ((state->dxAB * (real_y - state->vertexAy)) >> 4);
+                else
+                        x2 = (state->vertexBx << 12) + ((state->dxBC * (real_y - state->vertexBy)) >> 4);
+
+                if (params->fbzMode & (1 << 17))
+                        real_y = (voodoo->v_disp-1) - (real_y >> 4);
+                else
+                        real_y >>= 4;
+
+                if (SLI_ENABLED)
+                {
+                        if (((real_y >> 1) & voodoo->odd_even_mask) != odd_even)
+                                goto next_line;
+                }
+                else
+                {
+                        if ((real_y & voodoo->odd_even_mask) != odd_even)
+                                goto next_line;
+                }
+
+                start_x = x;
+
+                if (state->xdir > 0)
+                        x2 -= (1 << 16);
+                else
+                        x -= (1 << 16);
+                dx = ((x + 0x7000) >> 16) - (((state->vertexAx << 12) + 0x7000) >> 16);
+                start_x2 = x + 0x7000;
+                x = (x + 0x7000) >> 16;
+                x2 = (x2 + 0x7000) >> 16;
+
+                if (voodoo_output)
+                        pclog("%03i:%03i : Ax=%08x start_x=%08x  dSdX=%016llx dx=%08x  s=%08x -> ", x, state->y, state->vertexAx << 8, start_x, params->tmu[0].dTdX, dx, state->tmu0_t);
+
+                state->ir += (params->dRdX * dx);
+                state->ig += (params->dGdX * dx);
+                state->ib += (params->dBdX * dx);
+                state->ia += (params->dAdX * dx);
+                state->z += (params->dZdX * dx);
+                state->tmu0_s += (params->tmu[0].dSdX * dx);
+                state->tmu0_t += (params->tmu[0].dTdX * dx);
+                state->tmu0_w += (params->tmu[0].dWdX * dx);
+                state->tmu1_s += (params->tmu[1].dSdX * dx);
+                state->tmu1_t += (params->tmu[1].dTdX * dx);
+                state->tmu1_w += (params->tmu[1].dWdX * dx);
+                state->w += (params->dWdX * dx);
+
+                if (voodoo_output)
+                        pclog("%08llx %lli %lli\n", state->tmu0_t, state->tmu0_t >> (18+state->lod), (state->tmu0_t + (1 << (17+state->lod))) >> (18+state->lod));
+
+                if (params->fbzMode & 1)
+                {
+                        if (state->xdir > 0)
+                        {
+                                if (x < params->clipLeft)
+                                {
+                                        int dx = params->clipLeft - x;
+
+                                        state->ir += params->dRdX*dx;
+                                        state->ig += params->dGdX*dx;
+                                        state->ib += params->dBdX*dx;
+                                        state->ia += params->dAdX*dx;
+                                        state->z += params->dZdX*dx;
+                                        state->tmu0_s += params->tmu[0].dSdX*dx;
+                                        state->tmu0_t += params->tmu[0].dTdX*dx;
+                                        state->tmu0_w += params->tmu[0].dWdX*dx;
+                                        state->tmu1_s += params->tmu[1].dSdX*dx;
+                                        state->tmu1_t += params->tmu[1].dTdX*dx;
+                                        state->tmu1_w += params->tmu[1].dWdX*dx;
+                                        state->w += params->dWdX*dx;
+
+                                        x = params->clipLeft;
+                                }
+                                if (x2 >= params->clipRight)
+                                        x2 = params->clipRight-1;
+                        }
+                        else
+                        {
+                                if (x >= params->clipRight)
+                                {
+                                        int dx = (params->clipRight-1) - x;
+
+                                        state->ir += params->dRdX*dx;
+                                        state->ig += params->dGdX*dx;
+                                        state->ib += params->dBdX*dx;
+                                        state->ia += params->dAdX*dx;
+                                        state->z += params->dZdX*dx;
+                                        state->tmu0_s += params->tmu[0].dSdX*dx;
+                                        state->tmu0_t += params->tmu[0].dTdX*dx;
+                                        state->tmu0_w += params->tmu[0].dWdX*dx;
+                                        state->tmu1_s += params->tmu[1].dSdX*dx;
+                                        state->tmu1_t += params->tmu[1].dTdX*dx;
+                                        state->tmu1_w += params->tmu[1].dWdX*dx;
+                                        state->w += params->dWdX*dx;
+
+                                        x = params->clipRight-1;
+                                }
+                                if (x2 < params->clipLeft)
+                                        x2 = params->clipLeft;
+                        }
+                }
+
+                if (x2 < x && state->xdir > 0)
+                        goto next_line;
+                if (x2 > x && state->xdir < 0)
+                        goto next_line;
+
+                if (SLI_ENABLED)
+                {
+                        state->fb_mem = fb_mem = (uint16_t *)&voodoo->fb_mem[params->draw_offset + ((real_y >> 1) * params->row_width)];
+                        state->aux_mem = aux_mem = (uint16_t *)&voodoo->fb_mem[(params->aux_offset + ((real_y >> 1) * params->row_width)) & voodoo->fb_mask];
+                }
+                else
+                {
+                        if (params->col_tiled)
+                                state->fb_mem = fb_mem = (uint16_t *)&voodoo->fb_mem[params->draw_offset + (real_y >> 5) * params->row_width + (real_y & 31) * 128];
+                        else
+                                state->fb_mem = fb_mem = (uint16_t *)&voodoo->fb_mem[params->draw_offset + (real_y * params->row_width)];
+                        if (params->aux_tiled)
+                                state->aux_mem = aux_mem = (uint16_t *)&voodoo->fb_mem[(params->aux_offset + (real_y >> 5) * params->aux_row_width + (real_y & 31) * 128) & voodoo->fb_mask];
+                        else
+                                state->aux_mem = aux_mem = (uint16_t *)&voodoo->fb_mem[(params->aux_offset + (real_y * params->row_width)) & voodoo->fb_mask];
+                }
+
+                if (voodoo_output)
+                        pclog("%03i: x=%08x x2=%08x xstart=%08x xend=%08x dx=%08x start_x2=%08x\n", state->y, x, x2, state->xstart, state->xend, dx, start_x2);
+
+                state->pixel_count = 0;
+                state->texel_count = 0;
+                state->x = x;
+                state->x2 = x2;
+#ifndef NO_CODEGEN
+                if (voodoo->use_recompiler)
+                {
+                        voodoo_draw(state, params, x, real_y);
+                }
+                else
+#endif
+                do
+                {
+                        int x_tiled = (x & 63) | ((x >> 6) * 128*32/2);
+                        start_x = x;
+                        state->x = x;
+                        voodoo->pixel_count[odd_even]++;
+                        voodoo->texel_count[odd_even] += texels;
+                        voodoo->fbiPixelsIn++;
+
+                        if (voodoo_output)
+                                pclog("  X=%03i T=%08x\n", x, state->tmu0_t);
+//                        if (voodoo->fbzMode & FBZ_RGB_WMASK)
+                        {
+                                int update = 1;
+                                uint8_t cother_r, cother_g, cother_b, aother;
+                                uint8_t clocal_r, clocal_g, clocal_b, alocal;
+                                int src_r = 0, src_g = 0, src_b = 0, src_a = 0;
+                                int msel_r, msel_g, msel_b, msel_a;
+                                uint8_t dest_r, dest_g, dest_b, dest_a;
+                                uint16_t dat;
+                                int sel;
+                                int32_t new_depth, w_depth;
+
+                                if (state->w & 0xffff00000000)
+                                        w_depth = 0;
+                                else if (!(state->w & 0xffff0000))
+                                        w_depth = 0xf001;
+                                else
+                                {
+                                        int exp = voodoo_fls((uint16_t)((uint32_t)state->w >> 16));
+                                        int mant = ((~(uint32_t)state->w >> (19 - exp))) & 0xfff;
+                                        w_depth = (exp << 12) + mant + 1;
+                                        if (w_depth > 0xffff)
+                                                w_depth = 0xffff;
+                                }
+
+//                                w_depth = CLAMP16(w_depth);
+
+                                if (params->fbzMode & FBZ_W_BUFFER)
+                                        new_depth = w_depth;
+                                else
+                                        new_depth = CLAMP16(state->z >> 12);
+
+                                if (params->fbzMode & FBZ_DEPTH_BIAS)
+                                        new_depth = CLAMP16(new_depth + (int16_t)params->zaColor);
+
+                                if (params->fbzMode & FBZ_DEPTH_ENABLE)
+                                {
+                                        uint16_t old_depth = voodoo->params.aux_tiled ? aux_mem[x_tiled] : aux_mem[x];
+
+                                        DEPTH_TEST((params->fbzMode & FBZ_DEPTH_SOURCE) ? (params->zaColor & 0xffff) : new_depth);
+                                }
+
+                                dat = voodoo->params.col_tiled ? fb_mem[x_tiled] : fb_mem[x];
+                                dest_r = (dat >> 8) & 0xf8;
+                                dest_g = (dat >> 3) & 0xfc;
+                                dest_b = (dat << 3) & 0xf8;
+                                dest_r |= (dest_r >> 5);
+                                dest_g |= (dest_g >> 6);
+                                dest_b |= (dest_b >> 5);
+                                dest_a = 0xff;
+
+                                if (params->fbzColorPath & FBZCP_TEXTURE_ENABLED)
+                                {
+                                        if ((params->textureMode[0] & TEXTUREMODE_LOCAL_MASK) == TEXTUREMODE_LOCAL || !voodoo->dual_tmus)
+                                        {
+                                                /*TMU0 only sampling local colour or only one TMU, only sample TMU0*/
+                                                voodoo_tmu_fetch(voodoo, params, state, 0, x);
+                                        }
+                                        else if ((params->textureMode[0] & TEXTUREMODE_MASK) == TEXTUREMODE_PASSTHROUGH)
+                                        {
+                                                /*TMU0 in pass-through mode, only sample TMU1*/
+                                                voodoo_tmu_fetch(voodoo, params, state, 1, x);
+
+                                                state->tex_r[0] = state->tex_r[1];
+                                                state->tex_g[0] = state->tex_g[1];
+                                                state->tex_b[0] = state->tex_b[1];
+                                                state->tex_a[0] = state->tex_a[1];
+                                        }
+                                        else
+                                        {
+                                                voodoo_tmu_fetch_and_blend(voodoo, params, state, x);
+                                        }
+
+                                        if ((params->fbzMode & FBZ_CHROMAKEY) &&
+                                                state->tex_r[0] == params->chromaKey_r &&
+                                                state->tex_g[0] == params->chromaKey_g &&
+                                                state->tex_b[0] == params->chromaKey_b)
+                                        {
+                                                voodoo->fbiChromaFail++;
+                                                goto skip_pixel;
+                                        }
+                                }
+
+                                if (voodoo->trexInit1[0] & (1 << 18))
+                                {
+                                        state->tex_r[0] = state->tex_g[0] = 0;
+                                        state->tex_b[0] = voodoo->tmuConfig;
+                                }
+
+                                if (cc_localselect_override)
+                                        sel = (state->tex_a[0] & 0x80) ? 1 : 0;
+                                else
+                                        sel = cc_localselect;
+
+                                if (sel)
+                                {
+                                        clocal_r = (params->color0 >> 16) & 0xff;
+                                        clocal_g = (params->color0 >> 8)  & 0xff;
+                                        clocal_b =  params->color0        & 0xff;
+                                }
+                                else
+                                {
+                                        clocal_r = CLAMP(state->ir >> 12);
+                                        clocal_g = CLAMP(state->ig >> 12);
+                                        clocal_b = CLAMP(state->ib >> 12);
+                                }
+
+                                switch (_rgb_sel)
+                                {
+                                        case CC_LOCALSELECT_ITER_RGB: /*Iterated RGB*/
+                                        cother_r = CLAMP(state->ir >> 12);
+                                        cother_g = CLAMP(state->ig >> 12);
+                                        cother_b = CLAMP(state->ib >> 12);
+                                        break;
+
+                                        case CC_LOCALSELECT_TEX: /*TREX Color Output*/
+                                        cother_r = state->tex_r[0];
+                                        cother_g = state->tex_g[0];
+                                        cother_b = state->tex_b[0];
+                                        break;
+
+                                        case CC_LOCALSELECT_COLOR1: /*Color1 RGB*/
+                                        cother_r = (params->color1 >> 16) & 0xff;
+                                        cother_g = (params->color1 >> 8)  & 0xff;
+                                        cother_b =  params->color1        & 0xff;
+                                        break;
+
+                                        case CC_LOCALSELECT_LFB: /*Linear Frame Buffer*/
+                                        cother_r = src_r;
+                                        cother_g = src_g;
+                                        cother_b = src_b;
+                                        break;
+                                }
+
+                                switch (cca_localselect)
+                                {
+                                        case CCA_LOCALSELECT_ITER_A:
+                                        alocal = CLAMP(state->ia >> 12);
+                                        break;
+
+                                        case CCA_LOCALSELECT_COLOR0:
+                                        alocal = (params->color0 >> 24) & 0xff;
+                                        break;
+
+                                        case CCA_LOCALSELECT_ITER_Z:
+                                        alocal = CLAMP(state->z >> 20);
+                                        break;
+
+                                        default:
+                                        fatal("Bad cca_localselect %i\n", cca_localselect);
+                                        alocal = 0xff;
+                                        break;
+                                }
+
+                                switch (a_sel)
+                                {
+                                        case A_SEL_ITER_A:
+                                        aother = CLAMP(state->ia >> 12);
+                                        break;
+                                        case A_SEL_TEX:
+                                        aother = state->tex_a[0];
+                                        break;
+                                        case A_SEL_COLOR1:
+                                        aother = (params->color1 >> 24) & 0xff;
+                                        break;
+                                        default:
+                                        fatal("Bad a_sel %i\n", a_sel);
+                                        aother = 0;
+                                        break;
+                                }
+
+                                if (cc_zero_other)
+                                {
+                                        src_r = 0;
+                                        src_g = 0;
+                                        src_b = 0;
+                                }
+                                else
+                                {
+                                        src_r = cother_r;
+                                        src_g = cother_g;
+                                        src_b = cother_b;
+                                }
+
+                                if (cca_zero_other)
+                                        src_a = 0;
+                                else
+                                        src_a = aother;
+
+                                if (cc_sub_clocal)
+                                {
+                                        src_r -= clocal_r;
+                                        src_g -= clocal_g;
+                                        src_b -= clocal_b;
+                                }
+
+                                if (cca_sub_clocal)
+                                        src_a -= alocal;
+
+                                switch (cc_mselect)
+                                {
+                                        case CC_MSELECT_ZERO:
+                                        msel_r = 0;
+                                        msel_g = 0;
+                                        msel_b = 0;
+                                        break;
+                                        case CC_MSELECT_CLOCAL:
+                                        msel_r = clocal_r;
+                                        msel_g = clocal_g;
+                                        msel_b = clocal_b;
+                                        break;
+                                        case CC_MSELECT_AOTHER:
+                                        msel_r = aother;
+                                        msel_g = aother;
+                                        msel_b = aother;
+                                        break;
+                                        case CC_MSELECT_ALOCAL:
+                                        msel_r = alocal;
+                                        msel_g = alocal;
+                                        msel_b = alocal;
+                                        break;
+                                        case CC_MSELECT_TEX:
+                                        msel_r = state->tex_a[0];
+                                        msel_g = state->tex_a[0];
+                                        msel_b = state->tex_a[0];
+                                        break;
+                                        case CC_MSELECT_TEXRGB:
+                                        msel_r = state->tex_r[0];
+                                        msel_g = state->tex_g[0];
+                                        msel_b = state->tex_b[0];
+                                        break;
+
+                                        default:
+                                                fatal("Bad cc_mselect %i\n", cc_mselect);
+                                        msel_r = 0;
+                                        msel_g = 0;
+                                        msel_b = 0;
+                                        break;
+                                }
+
+                                switch (cca_mselect)
+                                {
+                                        case CCA_MSELECT_ZERO:
+                                        msel_a = 0;
+                                        break;
+                                        case CCA_MSELECT_ALOCAL:
+                                        msel_a = alocal;
+                                        break;
+                                        case CCA_MSELECT_AOTHER:
+                                        msel_a = aother;
+                                        break;
+                                        case CCA_MSELECT_ALOCAL2:
+                                        msel_a = alocal;
+                                        break;
+                                        case CCA_MSELECT_TEX:
+                                        msel_a = state->tex_a[0];
+                                        break;
+
+                                        default:
+                                                fatal("Bad cca_mselect %i\n", cca_mselect);
+                                        msel_a = 0;
+                                        break;
+                                }
+
+                                if (!cc_reverse_blend)
+                                {
+                                        msel_r ^= 0xff;
+                                        msel_g ^= 0xff;
+                                        msel_b ^= 0xff;
+                                }
+                                msel_r++;
+                                msel_g++;
+                                msel_b++;
+
+                                if (!cca_reverse_blend)
+                                        msel_a ^= 0xff;
+                                msel_a++;
+
+                                src_r = (src_r * msel_r) >> 8;
+                                src_g = (src_g * msel_g) >> 8;
+                                src_b = (src_b * msel_b) >> 8;
+                                src_a = (src_a * msel_a) >> 8;
+
+                                switch (cc_add)
+                                {
+                                        case CC_ADD_CLOCAL:
+                                        src_r += clocal_r;
+                                        src_g += clocal_g;
+                                        src_b += clocal_b;
+                                        break;
+                                        case CC_ADD_ALOCAL:
+                                        src_r += alocal;
+                                        src_g += alocal;
+                                        src_b += alocal;
+                                        break;
+                                        case 0:
+                                        break;
+                                        default:
+                                        fatal("Bad cc_add %i\n", cc_add);
+                                }
+
+                                if (cca_add)
+                                        src_a += alocal;
+
+                                src_r = CLAMP(src_r);
+                                src_g = CLAMP(src_g);
+                                src_b = CLAMP(src_b);
+                                src_a = CLAMP(src_a);
+
+                                if (cc_invert_output)
+                                {
+                                        src_r ^= 0xff;
+                                        src_g ^= 0xff;
+                                        src_b ^= 0xff;
+                                }
+                                if (cca_invert_output)
+                                        src_a ^= 0xff;
+
+                                if (params->fogMode & FOG_ENABLE)
+                                        APPLY_FOG(src_r, src_g, src_b, state->z, state->ia, state->w);
+
+                                if (params->alphaMode & 1)
+                                        ALPHA_TEST(src_a);
+
+                                if (params->alphaMode & (1 << 4))
+                                        ALPHA_BLEND(src_r, src_g, src_b, src_a);
+
+                                if (update)
+                                {
+                                        if (dither)
+                                        {
+                                                if (dither2x2)
+                                                {
+                                                        src_r = dither_rb2x2[src_r][real_y & 1][x & 1];
+                                                        src_g =  dither_g2x2[src_g][real_y & 1][x & 1];
+                                                        src_b = dither_rb2x2[src_b][real_y & 1][x & 1];
+                                                }
+                                                else
+                                                {
+                                                        src_r = dither_rb[src_r][real_y & 3][x & 3];
+                                                        src_g =  dither_g[src_g][real_y & 3][x & 3];
+                                                        src_b = dither_rb[src_b][real_y & 3][x & 3];
+                                                }
+                                        }
+                                        else
+                                        {
+                                                src_r >>= 3;
+                                                src_g >>= 2;
+                                                src_b >>= 3;
+                                        }
+
+                                        if (params->fbzMode & FBZ_RGB_WMASK)
+                                        {
+                                                if (voodoo->params.col_tiled)
+                                                        fb_mem[x_tiled] = src_b | (src_g << 5) | (src_r << 11);
+                                                else
+                                                        fb_mem[x] = src_b | (src_g << 5) | (src_r << 11);
+                                        }
+                                        if ((params->fbzMode & (FBZ_DEPTH_WMASK | FBZ_DEPTH_ENABLE)) == (FBZ_DEPTH_WMASK | FBZ_DEPTH_ENABLE))
+                                        {
+                                                if (voodoo->params.aux_tiled)
+                                                        aux_mem[x_tiled] = new_depth;
+                                                else
+                                                        aux_mem[x] = new_depth;
+                                        }
+                                }
+                        }
+                        voodoo_output &= ~2;
+                        voodoo->fbiPixelsOut++;
+skip_pixel:
+                        if (state->xdir > 0)
+                        {
+                                state->ir += params->dRdX;
+                                state->ig += params->dGdX;
+                                state->ib += params->dBdX;
+                                state->ia += params->dAdX;
+                                state->z += params->dZdX;
+                                state->tmu0_s += params->tmu[0].dSdX;
+                                state->tmu0_t += params->tmu[0].dTdX;
+                                state->tmu0_w += params->tmu[0].dWdX;
+                                state->tmu1_s += params->tmu[1].dSdX;
+                                state->tmu1_t += params->tmu[1].dTdX;
+                                state->tmu1_w += params->tmu[1].dWdX;
+                                state->w += params->dWdX;
+                        }
+                        else
+                        {
+                                state->ir -= params->dRdX;
+                                state->ig -= params->dGdX;
+                                state->ib -= params->dBdX;
+                                state->ia -= params->dAdX;
+                                state->z -= params->dZdX;
+                                state->tmu0_s -= params->tmu[0].dSdX;
+                                state->tmu0_t -= params->tmu[0].dTdX;
+                                state->tmu0_w -= params->tmu[0].dWdX;
+                                state->tmu1_s -= params->tmu[1].dSdX;
+                                state->tmu1_t -= params->tmu[1].dTdX;
+                                state->tmu1_w -= params->tmu[1].dWdX;
+                                state->w -= params->dWdX;
+                        }
+
+                        x += state->xdir;
+                } while (start_x != x2);
+
+                voodoo->pixel_count[odd_even] += state->pixel_count;
+                voodoo->texel_count[odd_even] += state->texel_count;
+                voodoo->fbiPixelsIn += state->pixel_count;
+
+                if (voodoo->params.draw_offset == voodoo->params.front_offset && (real_y >> 1) < 2048)
+                        voodoo->dirty_line[real_y >> 1] = 1;
+
+next_line:
+                if (SLI_ENABLED)
+                {
+                        state->base_r += params->dRdY;
+                        state->base_g += params->dGdY;
+                        state->base_b += params->dBdY;
+                        state->base_a += params->dAdY;
+                        state->base_z += params->dZdY;
+                        state->tmu[0].base_s += params->tmu[0].dSdY;
+                        state->tmu[0].base_t += params->tmu[0].dTdY;
+                        state->tmu[0].base_w += params->tmu[0].dWdY;
+                        state->tmu[1].base_s += params->tmu[1].dSdY;
+                        state->tmu[1].base_t += params->tmu[1].dTdY;
+                        state->tmu[1].base_w += params->tmu[1].dWdY;
+                        state->base_w += params->dWdY;
+                        state->xstart += state->dx1;
+                        state->xend += state->dx2;
+                }
+                state->base_r += params->dRdY;
+                state->base_g += params->dGdY;
+                state->base_b += params->dBdY;
+                state->base_a += params->dAdY;
+                state->base_z += params->dZdY;
+                state->tmu[0].base_s += params->tmu[0].dSdY;
+                state->tmu[0].base_t += params->tmu[0].dTdY;
+                state->tmu[0].base_w += params->tmu[0].dWdY;
+                state->tmu[1].base_s += params->tmu[1].dSdY;
+                state->tmu[1].base_t += params->tmu[1].dTdY;
+                state->tmu[1].base_w += params->tmu[1].dWdY;
+                state->base_w += params->dWdY;
+                state->xstart += state->dx1;
+                state->xend += state->dx2;
+        }
+
+        voodoo->texture_cache[0][params->tex_entry[0]].refcount_r[odd_even]++;
+        voodoo->texture_cache[1][params->tex_entry[1]].refcount_r[odd_even]++;
+}
+
+void voodoo_triangle(voodoo_t *voodoo, voodoo_params_t *params, int odd_even)
+{
+        voodoo_state_t state;
+        int vertexAy_adjusted;
+        int vertexCy_adjusted;
+        int dx, dy;
+
+        uint64_t tempdx, tempdy;
+        uint64_t tempLOD;
+        int LOD;
+        int lodbias;
+
+        voodoo->tri_count++;
+
+        dx = 8 - (params->vertexAx & 0xf);
+        if ((params->vertexAx & 0xf) > 8)
+                dx += 16;
+        dy = 8 - (params->vertexAy & 0xf);
+        if ((params->vertexAy & 0xf) > 8)
+                dy += 16;
+
+/*        pclog("voodoo_triangle %i %i %i : vA %f, %f  vB %f, %f  vC %f, %f f %i,%i %08x %08x %08x,%08x tex=%i,%i fogMode=%08x\n", odd_even, voodoo->params_read_idx[odd_even], voodoo->params_read_idx[odd_even] & PARAM_MASK, (float)params->vertexAx / 16.0, (float)params->vertexAy / 16.0,
+                                                                     (float)params->vertexBx / 16.0, (float)params->vertexBy / 16.0,
+                                                                     (float)params->vertexCx / 16.0, (float)params->vertexCy / 16.0,
+                                                                     (params->fbzColorPath & FBZCP_TEXTURE_ENABLED) ? params->tformat[0] : 0,
+                                                                     (params->fbzColorPath & FBZCP_TEXTURE_ENABLED) ? params->tformat[1] : 0, params->fbzColorPath, params->alphaMode, params->textureMode[0],params->textureMode[1], params->tex_entry[0],params->tex_entry[1], params->fogMode);*/
+
+        state.base_r = params->startR;
+        state.base_g = params->startG;
+        state.base_b = params->startB;
+        state.base_a = params->startA;
+        state.base_z = params->startZ;
+        state.tmu[0].base_s = params->tmu[0].startS;
+        state.tmu[0].base_t = params->tmu[0].startT;
+        state.tmu[0].base_w = params->tmu[0].startW;
+        state.tmu[1].base_s = params->tmu[1].startS;
+        state.tmu[1].base_t = params->tmu[1].startT;
+        state.tmu[1].base_w = params->tmu[1].startW;
+        state.base_w = params->startW;
+
+        if (params->fbzColorPath & FBZ_PARAM_ADJUST)
+        {
+                state.base_r += (dx*params->dRdX + dy*params->dRdY) >> 4;
+                state.base_g += (dx*params->dGdX + dy*params->dGdY) >> 4;
+                state.base_b += (dx*params->dBdX + dy*params->dBdY) >> 4;
+                state.base_a += (dx*params->dAdX + dy*params->dAdY) >> 4;
+                state.base_z += (dx*params->dZdX + dy*params->dZdY) >> 4;
+                state.tmu[0].base_s += (dx*params->tmu[0].dSdX + dy*params->tmu[0].dSdY) >> 4;
+                state.tmu[0].base_t += (dx*params->tmu[0].dTdX + dy*params->tmu[0].dTdY) >> 4;
+                state.tmu[0].base_w += (dx*params->tmu[0].dWdX + dy*params->tmu[0].dWdY) >> 4;
+                state.tmu[1].base_s += (dx*params->tmu[1].dSdX + dy*params->tmu[1].dSdY) >> 4;
+                state.tmu[1].base_t += (dx*params->tmu[1].dTdX + dy*params->tmu[1].dTdY) >> 4;
+                state.tmu[1].base_w += (dx*params->tmu[1].dWdX + dy*params->tmu[1].dWdY) >> 4;
+                state.base_w += (dx*params->dWdX + dy*params->dWdY) >> 4;
+        }
+
+        tris++;
+
+        state.vertexAy = params->vertexAy & ~0xffff0000;
+        if (state.vertexAy & 0x8000)
+                state.vertexAy |= 0xffff0000;
+        state.vertexBy = params->vertexBy & ~0xffff0000;
+        if (state.vertexBy & 0x8000)
+                state.vertexBy |= 0xffff0000;
+        state.vertexCy = params->vertexCy & ~0xffff0000;
+        if (state.vertexCy & 0x8000)
+                state.vertexCy |= 0xffff0000;
+
+        state.vertexAx = params->vertexAx & ~0xffff0000;
+        if (state.vertexAx & 0x8000)
+                state.vertexAx |= 0xffff0000;
+        state.vertexBx = params->vertexBx & ~0xffff0000;
+        if (state.vertexBx & 0x8000)
+                state.vertexBx |= 0xffff0000;
+        state.vertexCx = params->vertexCx & ~0xffff0000;
+        if (state.vertexCx & 0x8000)
+                state.vertexCx |= 0xffff0000;
+
+        vertexAy_adjusted = (state.vertexAy+7) >> 4;
+        vertexCy_adjusted = (state.vertexCy+7) >> 4;
+
+        if (state.vertexBy - state.vertexAy)
+                state.dxAB = (int)((((int64_t)state.vertexBx << 12) - ((int64_t)state.vertexAx << 12)) << 4) / (int)(state.vertexBy - state.vertexAy);
+        else
+                state.dxAB = 0;
+        if (state.vertexCy - state.vertexAy)
+                state.dxAC = (int)((((int64_t)state.vertexCx << 12) - ((int64_t)state.vertexAx << 12)) << 4) / (int)(state.vertexCy - state.vertexAy);
+        else
+                state.dxAC = 0;
+        if (state.vertexCy - state.vertexBy)
+                state.dxBC = (int)((((int64_t)state.vertexCx << 12) - ((int64_t)state.vertexBx << 12)) << 4) / (int)(state.vertexCy - state.vertexBy);
+        else
+                state.dxBC = 0;
+
+        state.lod_min[0] = (params->tLOD[0] & 0x3f) << 6;
+        state.lod_max[0] = ((params->tLOD[0] >> 6) & 0x3f) << 6;
+        if (state.lod_max[0] > 0x800)
+                state.lod_max[0] = 0x800;
+        state.lod_min[1] = (params->tLOD[1] & 0x3f) << 6;
+        state.lod_max[1] = ((params->tLOD[1] >> 6) & 0x3f) << 6;
+        if (state.lod_max[1] > 0x800)
+                state.lod_max[1] = 0x800;
+
+        state.xstart = state.xend = state.vertexAx << 8;
+        state.xdir = params->sign ? -1 : 1;
+
+        state.y = (state.vertexAy + 8) >> 4;
+        state.ydir = 1;
+
+
+        tempdx = (params->tmu[0].dSdX >> 14) * (params->tmu[0].dSdX >> 14) + (params->tmu[0].dTdX >> 14) * (params->tmu[0].dTdX >> 14);
+        tempdy = (params->tmu[0].dSdY >> 14) * (params->tmu[0].dSdY >> 14) + (params->tmu[0].dTdY >> 14) * (params->tmu[0].dTdY >> 14);
+
+        if (tempdx > tempdy)
+                tempLOD = tempdx;
+        else
+                tempLOD = tempdy;
+
+        LOD = (int)(log2((double)tempLOD / (double)(1ULL << 36)) * 256);
+        LOD >>= 2;
+
+        lodbias = (params->tLOD[0] >> 12) & 0x3f;
+        if (lodbias & 0x20)
+                lodbias |= ~0x3f;
+        state.tmu[0].lod = LOD + (lodbias << 6);
+
+
+        tempdx = (params->tmu[1].dSdX >> 14) * (params->tmu[1].dSdX >> 14) + (params->tmu[1].dTdX >> 14) * (params->tmu[1].dTdX >> 14);
+        tempdy = (params->tmu[1].dSdY >> 14) * (params->tmu[1].dSdY >> 14) + (params->tmu[1].dTdY >> 14) * (params->tmu[1].dTdY >> 14);
+
+        if (tempdx > tempdy)
+                tempLOD = tempdx;
+        else
+                tempLOD = tempdy;
+
+        LOD = (int)(log2((double)tempLOD / (double)(1ULL << 36)) * 256);
+        LOD >>= 2;
+
+        lodbias = (params->tLOD[1] >> 12) & 0x3f;
+        if (lodbias & 0x20)
+                lodbias |= ~0x3f;
+        state.tmu[1].lod = LOD + (lodbias << 6);
+
+
+        voodoo_half_triangle(voodoo, params, &state, vertexAy_adjusted, vertexCy_adjusted, odd_even);
+}
+
+
+static void render_thread(void *param, int odd_even)
+{
+        voodoo_t *voodoo = (voodoo_t *)param;
+
+        while (1)
+        {
+                thread_set_event(voodoo->render_not_full_event[odd_even]);
+                thread_wait_event(voodoo->wake_render_thread[odd_even], -1);
+                thread_reset_event(voodoo->wake_render_thread[odd_even]);
+                voodoo->render_voodoo_busy[odd_even] = 1;
+
+                while (!PARAM_EMPTY(odd_even))
+                {
+                        uint64_t start_time = timer_read();
+                        uint64_t end_time;
+                        voodoo_params_t *params = &voodoo->params_buffer[voodoo->params_read_idx[odd_even] & PARAM_MASK];
+
+                        voodoo_triangle(voodoo, params, odd_even);
+
+                        voodoo->params_read_idx[odd_even]++;
+
+                        if (PARAM_ENTRIES(odd_even) > (PARAM_SIZE - 10))
+                                thread_set_event(voodoo->render_not_full_event[odd_even]);
+
+                        end_time = timer_read();
+                        voodoo->render_time[odd_even] += end_time - start_time;
+                }
+
+                voodoo->render_voodoo_busy[odd_even] = 0;
+        }
+}
+
+/*Thread entry point: runs render_thread() with index 0*/
+void voodoo_render_thread_1(void *param)
+{
+        const int odd_even = 0;
+
+        render_thread(param, odd_even);
+}
+/*Thread entry point: runs render_thread() with index 1*/
+void voodoo_render_thread_2(void *param)
+{
+        const int odd_even = 1;
+
+        render_thread(param, odd_even);
+}
+/*Thread entry point: runs render_thread() with index 2*/
+void voodoo_render_thread_3(void *param)
+{
+        const int odd_even = 2;
+
+        render_thread(param, odd_even);
+}
+/*Thread entry point: runs render_thread() with index 3*/
+void voodoo_render_thread_4(void *param)
+{
+        const int odd_even = 3;
+
+        render_thread(param, odd_even);
+}
+
+/*Queue one triangle for the render threads.
+
+  Copies *params into the slot of voodoo->params_buffer selected by
+  params_write_idx, after waiting until PARAM_FULL is false for every render
+  thread in use (thread 0 always, thread 1 when render_threads >= 2, threads 2
+  and 3 when render_threads == 4).*/
+void voodoo_queue_triangle(voodoo_t *voodoo, voodoo_params_t *params)
+{
+        /*Destination slot; params_write_idx is only advanced further down, so the
+          slot stays valid across the wait loop*/
+        voodoo_params_t *params_new = &voodoo->params_buffer[voodoo->params_write_idx & PARAM_MASK];
+
+        while (PARAM_FULL(0) || (voodoo->render_threads >= 2 && PARAM_FULL(1)) ||
+                (voodoo->render_threads == 4 && (PARAM_FULL(2) || PARAM_FULL(3))))
+        {
+                /*Reset the not-full events first, then re-test PARAM_FULL before
+                  each wait so a thread that is no longer full is not waited on*/
+                thread_reset_event(voodoo->render_not_full_event[0]);
+                if (voodoo->render_threads >= 2)
+                        thread_reset_event(voodoo->render_not_full_event[1]);
+                if (voodoo->render_threads == 4)
+                {
+                        thread_reset_event(voodoo->render_not_full_event[2]);
+                        thread_reset_event(voodoo->render_not_full_event[3]);
+                }
+                if (PARAM_FULL(0))
+                        thread_wait_event(voodoo->render_not_full_event[0], -1); /*Wait for room in ringbuffer*/
+                if (voodoo->render_threads >= 2 && PARAM_FULL(1))
+                        thread_wait_event(voodoo->render_not_full_event[1], -1); /*Wait for room in ringbuffer*/
+                if (voodoo->render_threads == 4 && PARAM_FULL(2))
+                        thread_wait_event(voodoo->render_not_full_event[2], -1); /*Wait for room in ringbuffer*/
+                if (voodoo->render_threads == 4 && PARAM_FULL(3))
+                        thread_wait_event(voodoo->render_not_full_event[3], -1); /*Wait for room in ringbuffer*/
+        }
+
+        /*Called on the caller's params before they are copied. The last argument
+          is presumably the TMU index - confirm against voodoo_use_texture()*/
+        voodoo_use_texture(voodoo, params, 0);
+        if (voodoo->dual_tmus)
+                voodoo_use_texture(voodoo, params, 1);
+
+        memcpy(params_new, params, sizeof(voodoo_params_t));
+
+        /*Advance the write index only after the slot has been filled*/
+        voodoo->params_write_idx++;
+
+        /*Wake the render threads when PARAM_ENTRIES is below 4 for any thread in use*/
+        if (PARAM_ENTRIES(0) < 4 || (voodoo->render_threads >= 2 && PARAM_ENTRIES(1) < 4) ||
+                        (voodoo->render_threads == 4 && (PARAM_ENTRIES(2) < 4 || PARAM_ENTRIES(3) < 4)))
+                voodoo_wake_render_thread(voodoo);
+}
diff --git a/pcem/vid_voodoo_render.h b/pcem/vid_voodoo_render.h
new file mode 100644 (file)
index 0000000..9ba73dc
--- /dev/null
@@ -0,0 +1,338 @@
+/* Code generator availability: NO_CODEGEN is defined unless one of the
+   x86 / Windows / amd64 compiler macros tested below is present. */
+#if !(defined i386 || defined __i386 || defined __i386__ || defined _X86_ || defined WIN32 || defined _WIN32) && !(defined __amd64__)
+#define NO_CODEGEN
+#endif
+
+#ifndef NO_CODEGEN
+/* Code generator set-up / tear-down; only declared when NO_CODEGEN is not set. */
+void voodoo_codegen_init(voodoo_t *voodoo);
+void voodoo_codegen_close(voodoo_t *voodoo);
+#endif
+
+/*
+ * Depth comparison for one pixel. Compares comp_depth against old_depth using
+ * the function selected by depth_op. On failure the fbiZFuncFail counter is
+ * incremented and control jumps to the skip_pixel label.
+ *
+ * The expansion site must provide: depth_op, old_depth, voodoo and a
+ * skip_pixel label. comp_depth is evaluated at most once.
+ */
+#define DEPTH_TEST(comp_depth)                                          \
+        do                                                              \
+        {                                                               \
+                int depth_test_pass_ = 1;                               \
+                                                                        \
+                switch (depth_op)                                       \
+                {                                                       \
+                        case DEPTHOP_NEVER:                             \
+                        depth_test_pass_ = 0;                           \
+                        break;                                          \
+                        case DEPTHOP_LESSTHAN:                          \
+                        depth_test_pass_ = ((comp_depth) < old_depth);  \
+                        break;                                          \
+                        case DEPTHOP_EQUAL:                             \
+                        depth_test_pass_ = ((comp_depth) == old_depth); \
+                        break;                                          \
+                        case DEPTHOP_LESSTHANEQUAL:                     \
+                        depth_test_pass_ = ((comp_depth) <= old_depth); \
+                        break;                                          \
+                        case DEPTHOP_GREATERTHAN:                       \
+                        depth_test_pass_ = ((comp_depth) > old_depth);  \
+                        break;                                          \
+                        case DEPTHOP_NOTEQUAL:                          \
+                        depth_test_pass_ = ((comp_depth) != old_depth); \
+                        break;                                          \
+                        case DEPTHOP_GREATERTHANEQUAL:                  \
+                        depth_test_pass_ = ((comp_depth) >= old_depth); \
+                        break;                                          \
+                        case DEPTHOP_ALWAYS:                            \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                if (!depth_test_pass_)                                  \
+                {                                                       \
+                        voodoo->fbiZFuncFail++;                         \
+                        goto skip_pixel;                                \
+                }                                                       \
+        } while (0)
+
+/*
+ * Apply fog to a colour according to params->fogMode.
+ *
+ * src_r, src_g, src_b  modifiable lvalues; updated in place and clamped with
+ *                      CLAMP() on exit.
+ * z, ia, w             values used to derive the fog factor in the FOG_Z,
+ *                      FOG_ALPHA and FOG_W modes respectively.
+ *
+ * When neither FOG_Z nor FOG_ALPHA is set the factor is looked up in
+ * params->fogTable using w_depth, which is NOT a macro argument: it is taken
+ * from the scope where the macro is expanded, as is params.
+ *
+ * With FOG_CONSTANT the fog colour is simply added. Otherwise the fog colour
+ * (or zero with FOG_ADD), minus the source colour unless FOG_MULT is set, is
+ * scaled by (fog_a + 1) / 256 and either replaces (FOG_MULT) or is added to
+ * the source colour.
+ */
+#define APPLY_FOG(src_r, src_g, src_b, z, ia, w)                        \
+        do                                                              \
+        {                                                               \
+                if (params->fogMode & FOG_CONSTANT)                     \
+                {                                                       \
+                        src_r += params->fogColor.r;                    \
+                        src_g += params->fogColor.g;                    \
+                        src_b += params->fogColor.b;                    \
+                }                                                       \
+                else                                                    \
+                {                                                       \
+                        int fog_r, fog_g, fog_b, fog_a = 0;             \
+                        int fog_idx;                                    \
+                                                                        \
+                        if (!(params->fogMode & FOG_ADD))               \
+                        {                                               \
+                                fog_r = params->fogColor.r;             \
+                                fog_g = params->fogColor.g;             \
+                                fog_b = params->fogColor.b;             \
+                        }                                               \
+                        else                                            \
+                                fog_r = fog_g = fog_b = 0;              \
+                                                                        \
+                        if (!(params->fogMode & FOG_MULT))              \
+                        {                                               \
+                                fog_r -= (src_r);                       \
+                                fog_g -= (src_g);                       \
+                                fog_b -= (src_b);                       \
+                        }                                               \
+                                                                        \
+                        switch (params->fogMode & (FOG_Z|FOG_ALPHA))    \
+                        {                                               \
+                                case 0:                                 \
+                                fog_idx = (w_depth >> 10) & 0x3f;       \
+                                                                        \
+                                fog_a = params->fogTable[fog_idx].fog;  \
+                                fog_a += (params->fogTable[fog_idx].dfog * ((w_depth >> 2) & 0xff)) >> 10;      \
+                                break;                                  \
+                                case FOG_Z:                             \
+                                fog_a = ((z) >> 20) & 0xff;             \
+                                break;                                  \
+                                case FOG_ALPHA:                         \
+                                fog_a = CLAMP((ia) >> 12);              \
+                                break;                                  \
+                                case FOG_W:                             \
+                                fog_a = CLAMP(((w) >> 32) & 0xff);      \
+                                break;                                  \
+                        }                                               \
+                        fog_a++;                                        \
+                                                                        \
+                        fog_r = (fog_r * fog_a) >> 8;                   \
+                        fog_g = (fog_g * fog_a) >> 8;                   \
+                        fog_b = (fog_b * fog_a) >> 8;                   \
+                                                                        \
+                        if (params->fogMode & FOG_MULT)                 \
+                        {                                               \
+                                src_r = fog_r;                          \
+                                src_g = fog_g;                          \
+                                src_b = fog_b;                          \
+                        }                                               \
+                        else                                            \
+                        {                                               \
+                                src_r += fog_r;                         \
+                                src_g += fog_g;                         \
+                                src_b += fog_b;                         \
+                        }                                               \
+                }                                                       \
+                                                                        \
+                src_r = CLAMP(src_r);                                   \
+                src_g = CLAMP(src_g);                                   \
+                src_b = CLAMP(src_b);                                   \
+        } while (0)
+
+/*
+ * Alpha comparison for one pixel. Compares src_a against a_ref using the
+ * function selected by alpha_func. On failure the fbiAFuncFail counter is
+ * incremented and control jumps to the skip_pixel label.
+ *
+ * The expansion site must provide: alpha_func, a_ref, voodoo and a
+ * skip_pixel label. src_a is evaluated at most once.
+ */
+#define ALPHA_TEST(src_a)                                               \
+        do                                                              \
+        {                                                               \
+                int alpha_test_pass_ = 1;                               \
+                                                                        \
+                switch (alpha_func)                                     \
+                {                                                       \
+                        case AFUNC_NEVER:                               \
+                        alpha_test_pass_ = 0;                           \
+                        break;                                          \
+                        case AFUNC_LESSTHAN:                            \
+                        alpha_test_pass_ = ((src_a) < a_ref);           \
+                        break;                                          \
+                        case AFUNC_EQUAL:                               \
+                        alpha_test_pass_ = ((src_a) == a_ref);          \
+                        break;                                          \
+                        case AFUNC_LESSTHANEQUAL:                       \
+                        alpha_test_pass_ = ((src_a) <= a_ref);          \
+                        break;                                          \
+                        case AFUNC_GREATERTHAN:                         \
+                        alpha_test_pass_ = ((src_a) > a_ref);           \
+                        break;                                          \
+                        case AFUNC_NOTEQUAL:                            \
+                        alpha_test_pass_ = ((src_a) != a_ref);          \
+                        break;                                          \
+                        case AFUNC_GREATERTHANEQUAL:                    \
+                        alpha_test_pass_ = ((src_a) >= a_ref);          \
+                        break;                                          \
+                        case AFUNC_ALWAYS:                              \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                if (!alpha_test_pass_)                                  \
+                {                                                       \
+                        voodoo->fbiAFuncFail++;                         \
+                        goto skip_pixel;                                \
+                }                                                       \
+        } while (0)
+
+/*
+ * Blend a source colour with the destination colour.
+ *
+ * src_r, src_g, src_b  modifiable lvalues; on exit they hold the blended
+ *                      result, clamped with CLAMP().
+ * src_a                source alpha (read only).
+ *
+ * The expansion site must provide dest_r, dest_g, dest_b, dest_a and the two
+ * blend selectors dest_afunc and src_afunc. The destination contribution is
+ * computed first (from the unmodified source colour), then the source colour
+ * is scaled by its own factor, and finally the two are added.
+ *
+ * A src_afunc of AFUNC_ACOLORBEFOREFOG calls fatal().
+ *
+ * NOTE(review): the AFUNC_ASATURATE case uses MIN(src_a, 1-dest_a). The other
+ * cases treat alpha as a 0..255 quantity (255-x), so 255-dest_a may have been
+ * intended; left unchanged here because other code may mirror this
+ * behaviour - confirm before altering.
+ */
+#define ALPHA_BLEND(src_r, src_g, src_b, src_a)                         \
+        do                                                              \
+        {                                                               \
+                int _a;                                                 \
+                int newdest_r = 0, newdest_g = 0, newdest_b = 0;        \
+                                                                        \
+                switch (dest_afunc)                                     \
+                {                                                       \
+                        case AFUNC_AZERO:                               \
+                        newdest_r = newdest_g = newdest_b = 0;          \
+                        break;                                          \
+                        case AFUNC_ASRC_ALPHA:                          \
+                        newdest_r = (dest_r * (src_a)) / 255;           \
+                        newdest_g = (dest_g * (src_a)) / 255;           \
+                        newdest_b = (dest_b * (src_a)) / 255;           \
+                        break;                                          \
+                        case AFUNC_A_COLOR:                             \
+                        newdest_r = (dest_r * (src_r)) / 255;           \
+                        newdest_g = (dest_g * (src_g)) / 255;           \
+                        newdest_b = (dest_b * (src_b)) / 255;           \
+                        break;                                          \
+                        case AFUNC_ADST_ALPHA:                          \
+                        newdest_r = (dest_r * dest_a) / 255;            \
+                        newdest_g = (dest_g * dest_a) / 255;            \
+                        newdest_b = (dest_b * dest_a) / 255;            \
+                        break;                                          \
+                        case AFUNC_AONE:                                \
+                        newdest_r = dest_r;                             \
+                        newdest_g = dest_g;                             \
+                        newdest_b = dest_b;                             \
+                        break;                                          \
+                        case AFUNC_AOMSRC_ALPHA:                        \
+                        newdest_r = (dest_r * (255-(src_a))) / 255;     \
+                        newdest_g = (dest_g * (255-(src_a))) / 255;     \
+                        newdest_b = (dest_b * (255-(src_a))) / 255;     \
+                        break;                                          \
+                        case AFUNC_AOM_COLOR:                           \
+                        newdest_r = (dest_r * (255-(src_r))) / 255;     \
+                        newdest_g = (dest_g * (255-(src_g))) / 255;     \
+                        newdest_b = (dest_b * (255-(src_b))) / 255;     \
+                        break;                                          \
+                        case AFUNC_AOMDST_ALPHA:                        \
+                        newdest_r = (dest_r * (255-dest_a)) / 255;      \
+                        newdest_g = (dest_g * (255-dest_a)) / 255;      \
+                        newdest_b = (dest_b * (255-dest_a)) / 255;      \
+                        break;                                          \
+                        case AFUNC_ASATURATE:                           \
+                        _a = MIN((src_a), 1-dest_a);                    \
+                        newdest_r = (dest_r * _a) / 255;                \
+                        newdest_g = (dest_g * _a) / 255;                \
+                        newdest_b = (dest_b * _a) / 255;                \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                switch (src_afunc)                                      \
+                {                                                       \
+                        case AFUNC_AZERO:                               \
+                        src_r = src_g = src_b = 0;                      \
+                        break;                                          \
+                        case AFUNC_ASRC_ALPHA:                          \
+                        src_r = ((src_r) * (src_a)) / 255;              \
+                        src_g = ((src_g) * (src_a)) / 255;              \
+                        src_b = ((src_b) * (src_a)) / 255;              \
+                        break;                                          \
+                        case AFUNC_A_COLOR:                             \
+                        src_r = ((src_r) * dest_r) / 255;               \
+                        src_g = ((src_g) * dest_g) / 255;               \
+                        src_b = ((src_b) * dest_b) / 255;               \
+                        break;                                          \
+                        case AFUNC_ADST_ALPHA:                          \
+                        src_r = ((src_r) * dest_a) / 255;               \
+                        src_g = ((src_g) * dest_a) / 255;               \
+                        src_b = ((src_b) * dest_a) / 255;               \
+                        break;                                          \
+                        case AFUNC_AONE:                                \
+                        break;                                          \
+                        case AFUNC_AOMSRC_ALPHA:                        \
+                        src_r = ((src_r) * (255-(src_a))) / 255;        \
+                        src_g = ((src_g) * (255-(src_a))) / 255;        \
+                        src_b = ((src_b) * (255-(src_a))) / 255;        \
+                        break;                                          \
+                        case AFUNC_AOM_COLOR:                           \
+                        src_r = ((src_r) * (255-dest_r)) / 255;         \
+                        src_g = ((src_g) * (255-dest_g)) / 255;         \
+                        src_b = ((src_b) * (255-dest_b)) / 255;         \
+                        break;                                          \
+                        case AFUNC_AOMDST_ALPHA:                        \
+                        src_r = ((src_r) * (255-dest_a)) / 255;         \
+                        src_g = ((src_g) * (255-dest_a)) / 255;         \
+                        src_b = ((src_b) * (255-dest_a)) / 255;         \
+                        break;                                          \
+                        case AFUNC_ACOLORBEFOREFOG:                     \
+                        fatal("AFUNC_ACOLORBEFOREFOG\n"); \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                src_r += newdest_r;                                     \
+                src_g += newdest_g;                                     \
+                src_b += newdest_b;                                     \
+                                                                        \
+                src_r = CLAMP(src_r);                                   \
+                src_g = CLAMP(src_g);                                   \
+                src_b = CLAMP(src_b);                                   \
+        } while(0)
+
+
+
+/* Render thread functions, one per thread slot. Presumably used as thread
+   entry points with param carrying the voodoo_t - TODO confirm. */
+void voodoo_render_thread_1(void *param);
+void voodoo_render_thread_2(void *param);
+void voodoo_render_thread_3(void *param);
+void voodoo_render_thread_4(void *param);
+/* Copies *params into the parameter ring buffer (waiting while it is full)
+   and wakes the render threads when a queue holds fewer than 4 entries. */
+void voodoo_queue_triangle(voodoo_t *voodoo, voodoo_params_t *params);
+
+/* Counters defined elsewhere; meaning not visible from this header. */
+extern int voodoo_recomp;
+extern int tris;
+
+/*
+ * Signal the wake event of every active render thread.
+ *
+ * Thread 0 is always signalled; thread 1 when render_threads >= 2; threads
+ * 2 and 3 only when render_threads is exactly 4.
+ */
+static inline void voodoo_wake_render_thread(voodoo_t *voodoo)
+{
+        int nr_to_wake = 1;
+        int c;
+
+        if (voodoo->render_threads == 4)
+                nr_to_wake = 4;
+        else if (voodoo->render_threads >= 2)
+                nr_to_wake = 2;
+
+        /*Wake up render threads if moving from idle*/
+        for (c = 0; c < nr_to_wake; c++)
+                thread_set_event(voodoo->wake_render_thread[c]);
+}
+
+/*
+ * Block until every active render thread is idle, i.e. until PARAM_EMPTY()
+ * holds and render_voodoo_busy[] is clear for each of them.
+ *
+ * Which threads count as active follows render_threads: thread 0 always,
+ * thread 1 when >= 2, threads 2 and 3 only when == 4.
+ */
+static inline void voodoo_wait_for_render_thread_idle(voodoo_t *voodoo)
+{
+        while (!PARAM_EMPTY(0) || (voodoo->render_threads >= 2 && !PARAM_EMPTY(1)) ||
+                (voodoo->render_threads == 4 && (!PARAM_EMPTY(2) || !PARAM_EMPTY(3))) ||
+                voodoo->render_voodoo_busy[0] || (voodoo->render_threads >= 2 && voodoo->render_voodoo_busy[1]) ||
+                (voodoo->render_threads == 4 && (voodoo->render_voodoo_busy[2] || voodoo->render_voodoo_busy[3])))
+        {
+                /* Re-signal the wake events on every pass so a sleeping thread gets going */
+                voodoo_wake_render_thread(voodoo);
+                /* Wait with a timeout argument of 1 (unit presumably ms - TODO confirm)
+                   rather than indefinitely, then re-evaluate the loop condition */
+                if (!PARAM_EMPTY(0) || voodoo->render_voodoo_busy[0])
+                        thread_wait_event(voodoo->render_not_full_event[0], 1);
+                if (voodoo->render_threads >= 2 && (!PARAM_EMPTY(1) || voodoo->render_voodoo_busy[1]))
+                        thread_wait_event(voodoo->render_not_full_event[1], 1);
+                if (voodoo->render_threads == 4 && (!PARAM_EMPTY(2) || voodoo->render_voodoo_busy[2]))
+                        thread_wait_event(voodoo->render_not_full_event[2], 1);
+                if (voodoo->render_threads == 4 && (!PARAM_EMPTY(3) || voodoo->render_voodoo_busy[3]))
+                        thread_wait_event(voodoo->render_not_full_event[3], 1);
+        }
+}
diff --git a/pcem/vid_voodoo_setup.cpp b/pcem/vid_voodoo_setup.cpp
new file mode 100644 (file)
index 0000000..360d14a
--- /dev/null
@@ -0,0 +1,216 @@
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_setup.h"
+
+/*
+ * Turn the three vertices in voodoo->verts into triangle parameters and queue
+ * the triangle for rendering.
+ *
+ * The vertices are ordered by ascending sVy (A, B, C), converted to fixed
+ * point, and for every attribute group enabled in sSetupMode a start value
+ * (taken at vertex A) plus X and Y gradients are written to voodoo->params.
+ *
+ * Nothing is queued when the triangle has zero area, when it is rejected by
+ * culling, or when the converted Y coordinates end up out of order.
+ */
+void voodoo_triangle_setup(voodoo_t *voodoo)
+{
+/* Vertex coordinate -> fixed point: scale by 16, keep the low 16 bits, sign-extend */
+#define SETUP_COORD(v)          ((int32_t)(int16_t)((int32_t)((v) * 16.0f) & 0xffff))
+/* Per-attribute X and Y gradients (edge deltas are already divided by the area) */
+#define SETUP_GRAD_X(attr)      ((verts[va].attr - verts[vb].attr) * dy_bc - (verts[vb].attr - verts[vc].attr) * dy_ab)
+#define SETUP_GRAD_Y(attr)      ((verts[vb].attr - verts[vc].attr) * dx_ab - (verts[va].attr - verts[vb].attr) * dx_bc)
+
+        voodoo_params_t *params = &voodoo->params;
+        vert_t verts[3];
+        float dx_ab, dx_bc, dy_ab, dy_bc;
+        float area;
+        int va, vb, vc;
+        int c;
+
+        for (c = 0; c < 3; c++)
+                verts[c] = voodoo->verts[c];
+
+        /* Pick va/vb/vc so that the vertices are in ascending sVy order */
+        if (verts[0].sVy < verts[1].sVy)
+        {
+                if (verts[1].sVy < verts[2].sVy)
+                {
+                        /* V2>V1>V0 */
+                        va = 0; vb = 1; vc = 2;
+                }
+                else if (verts[0].sVy < verts[2].sVy)
+                {
+                        /* V1>V2>V0 */
+                        va = 0; vb = 2; vc = 1;
+                }
+                else
+                {
+                        /* V1>V0>V2 */
+                        va = 2; vb = 0; vc = 1;
+                }
+        }
+        else if (verts[1].sVy < verts[2].sVy)
+        {
+                if (verts[0].sVy < verts[2].sVy)
+                {
+                        /* V2>V0>V1 */
+                        va = 1; vb = 0; vc = 2;
+                }
+                else
+                {
+                        /* V0>V2>V1 */
+                        va = 1; vb = 2; vc = 0;
+                }
+        }
+        else
+        {
+                /* V0>V1>V2 */
+                va = 2; vb = 1; vc = 0;
+        }
+
+        /* Area from the vertices in their original order: used for the
+           degenerate check and for culling */
+        dx_ab = verts[0].sVx - verts[1].sVx;
+        dx_bc = verts[1].sVx - verts[2].sVx;
+        dy_ab = verts[0].sVy - verts[1].sVy;
+        dy_bc = verts[1].sVy - verts[2].sVy;
+
+        area = dx_ab * dy_bc - dx_bc * dy_ab;
+
+        if (area == 0.0)
+                return;
+
+        if (voodoo->sSetupMode & SETUPMODE_CULLING_ENABLE)
+        {
+                int cull_negative = (voodoo->sSetupMode & SETUPMODE_CULLING_SIGN) != 0;
+                int is_negative = (area < 0.0);
+
+                /* With ping-pong active the culling sign is inverted */
+                if ((voodoo->sSetupMode & (SETUPMODE_CULLING_ENABLE | SETUPMODE_DISABLE_PINGPONG))
+                                == SETUPMODE_CULLING_ENABLE && voodoo->cull_pingpong)
+                        cull_negative = !cull_negative;
+
+                if (cull_negative == is_negative)
+                        return;
+        }
+
+        /* Same again with the sorted vertices; the edge deltas are then
+           normalised by the area for the gradient calculations */
+        dx_ab = verts[va].sVx - verts[vb].sVx;
+        dx_bc = verts[vb].sVx - verts[vc].sVx;
+        dy_ab = verts[va].sVy - verts[vb].sVy;
+        dy_bc = verts[vb].sVy - verts[vc].sVy;
+
+        area = dx_ab * dy_bc - dx_bc * dy_ab;
+
+        dx_ab /= area;
+        dx_bc /= area;
+        dy_ab /= area;
+        dy_bc /= area;
+
+        params->vertexAx = SETUP_COORD(verts[va].sVx);
+        params->vertexAy = SETUP_COORD(verts[va].sVy);
+        params->vertexBx = SETUP_COORD(verts[vb].sVx);
+        params->vertexBy = SETUP_COORD(verts[vb].sVy);
+        params->vertexCx = SETUP_COORD(verts[vc].sVx);
+        params->vertexCy = SETUP_COORD(verts[vc].sVy);
+
+        if (!(params->vertexAy <= params->vertexBy && params->vertexBy <= params->vertexCy))
+        {
+                pclog("triangle_setup wrong order %d %d %d\n", params->vertexAy, params->vertexBy, params->vertexCy);
+                return;
+        }
+
+        if (voodoo->sSetupMode & SETUPMODE_RGB)
+        {
+                params->startR = (int32_t)(verts[va].sRed * 4096.0f);
+                params->dRdX = (int32_t)(SETUP_GRAD_X(sRed) * 4096.0f);
+                params->dRdY = (int32_t)(SETUP_GRAD_Y(sRed) * 4096.0f);
+                params->startG = (int32_t)(verts[va].sGreen * 4096.0f);
+                params->dGdX = (int32_t)(SETUP_GRAD_X(sGreen) * 4096.0f);
+                params->dGdY = (int32_t)(SETUP_GRAD_Y(sGreen) * 4096.0f);
+                params->startB = (int32_t)(verts[va].sBlue * 4096.0f);
+                params->dBdX = (int32_t)(SETUP_GRAD_X(sBlue) * 4096.0f);
+                params->dBdY = (int32_t)(SETUP_GRAD_Y(sBlue) * 4096.0f);
+        }
+        if (voodoo->sSetupMode & SETUPMODE_ALPHA)
+        {
+                params->startA = (int32_t)(verts[va].sAlpha * 4096.0f);
+                params->dAdX = (int32_t)(SETUP_GRAD_X(sAlpha) * 4096.0f);
+                params->dAdY = (int32_t)(SETUP_GRAD_Y(sAlpha) * 4096.0f);
+        }
+        if (voodoo->sSetupMode & SETUPMODE_Z)
+        {
+                params->startZ = (int32_t)(verts[va].sVz * 4096.0f);
+                params->dZdX = (int32_t)(SETUP_GRAD_X(sVz) * 4096.0f);
+                params->dZdY = (int32_t)(SETUP_GRAD_Y(sVz) * 4096.0f);
+        }
+        if (voodoo->sSetupMode & SETUPMODE_Wb)
+        {
+                params->startW = (int64_t)(verts[va].sWb * 4294967296.0f);
+                params->dWdX = (int64_t)(SETUP_GRAD_X(sWb) * 4294967296.0f);
+                params->dWdY = (int64_t)(SETUP_GRAD_Y(sWb) * 4294967296.0f);
+                /* Both TMUs start out with the same W as the framebuffer */
+                params->tmu[1].startW = params->startW;
+                params->tmu[0].startW = params->tmu[1].startW;
+                params->tmu[1].dWdX = params->dWdX;
+                params->tmu[0].dWdX = params->tmu[1].dWdX;
+                params->tmu[1].dWdY = params->dWdY;
+                params->tmu[0].dWdY = params->tmu[1].dWdY;
+        }
+        if (voodoo->sSetupMode & SETUPMODE_W0)
+        {
+                params->tmu[0].startW = (int64_t)(verts[va].sW0 * 4294967296.0f);
+                params->tmu[0].dWdX = (int64_t)(SETUP_GRAD_X(sW0) * 4294967296.0f);
+                params->tmu[0].dWdY = (int64_t)(SETUP_GRAD_Y(sW0) * 4294967296.0f);
+                /* TMU 1 mirrors TMU 0 unless overridden by SETUPMODE_W1 below */
+                params->tmu[1].startW = params->tmu[0].startW;
+                params->tmu[1].dWdX = params->tmu[0].dWdX;
+                params->tmu[1].dWdY = params->tmu[0].dWdY;
+        }
+        if (voodoo->sSetupMode & SETUPMODE_S0_T0)
+        {
+                params->tmu[0].startS = (int64_t)(verts[va].sS0 * 4294967296.0f);
+                params->tmu[0].dSdX = (int64_t)(SETUP_GRAD_X(sS0) * 4294967296.0f);
+                params->tmu[0].dSdY = (int64_t)(SETUP_GRAD_Y(sS0) * 4294967296.0f);
+                params->tmu[0].startT = (int64_t)(verts[va].sT0 * 4294967296.0f);
+                params->tmu[0].dTdX = (int64_t)(SETUP_GRAD_X(sT0) * 4294967296.0f);
+                params->tmu[0].dTdY = (int64_t)(SETUP_GRAD_Y(sT0) * 4294967296.0f);
+                /* TMU 1 mirrors TMU 0 unless overridden by SETUPMODE_S1_T1 below */
+                params->tmu[1].startS = params->tmu[0].startS;
+                params->tmu[1].dSdX = params->tmu[0].dSdX;
+                params->tmu[1].dSdY = params->tmu[0].dSdY;
+                params->tmu[1].startT = params->tmu[0].startT;
+                params->tmu[1].dTdX = params->tmu[0].dTdX;
+                params->tmu[1].dTdY = params->tmu[0].dTdY;
+        }
+        if (voodoo->sSetupMode & SETUPMODE_W1)
+        {
+                params->tmu[1].startW = (int64_t)(verts[va].sW1 * 4294967296.0f);
+                params->tmu[1].dWdX = (int64_t)(SETUP_GRAD_X(sW1) * 4294967296.0f);
+                params->tmu[1].dWdY = (int64_t)(SETUP_GRAD_Y(sW1) * 4294967296.0f);
+        }
+        if (voodoo->sSetupMode & SETUPMODE_S1_T1)
+        {
+                params->tmu[1].startS = (int64_t)(verts[va].sS1 * 4294967296.0f);
+                params->tmu[1].dSdX = (int64_t)(SETUP_GRAD_X(sS1) * 4294967296.0f);
+                params->tmu[1].dSdY = (int64_t)(SETUP_GRAD_Y(sS1) * 4294967296.0f);
+                params->tmu[1].startT = (int64_t)(verts[va].sT1 * 4294967296.0f);
+                params->tmu[1].dTdX = (int64_t)(SETUP_GRAD_X(sT1) * 4294967296.0f);
+                params->tmu[1].dTdY = (int64_t)(SETUP_GRAD_Y(sT1) * 4294967296.0f);
+        }
+
+        /* Sign of the area computed from the sorted vertices */
+        params->sign = (area < 0.0);
+
+        /* Refresh any dirty NCC tables before the triangle is handed over */
+        for (c = 0; c < 2; c++)
+        {
+                if (voodoo->ncc_dirty[c])
+                        voodoo_update_ncc(voodoo, c);
+        }
+        voodoo->ncc_dirty[1] = 0;
+        voodoo->ncc_dirty[0] = 0;
+
+        voodoo_queue_triangle(voodoo, params);
+
+#undef SETUP_GRAD_Y
+#undef SETUP_GRAD_X
+#undef SETUP_COORD
+}
diff --git a/pcem/vid_voodoo_setup.h b/pcem/vid_voodoo_setup.h
new file mode 100644 (file)
index 0000000..06d2f8d
--- /dev/null
@@ -0,0 +1 @@
+void voodoo_triangle_setup(voodoo_t *voodoo); /*Defined in vid_voodoo_setup.cpp; presumably builds voodoo->params from the setup-mode vertices and queues the triangle - TODO confirm*/
diff --git a/pcem/vid_voodoo_texture.cpp b/pcem/vid_voodoo_texture.cpp
new file mode 100644 (file)
index 0000000..f1d5c78
--- /dev/null
@@ -0,0 +1,583 @@
+#include <math.h>
+#include <stddef.h>
+#include "ibm.h"
+#include "device.h"
+#include "mem.h"
+#include "thread.h"
+#include "video.h"
+#include "vid_svga.h"
+#include "vid_voodoo.h"
+#include "vid_voodoo_common.h"
+#include "vid_voodoo_dither.h"
+#include "vid_voodoo_regs.h"
+#include "vid_voodoo_render.h"
+#include "vid_voodoo_texture.h"
+
+/*Rebuild the per-LOD texture tables of one TMU (tex_base, tex_end, tex_w_mask,
+  tex_w_nmask, tex_h_mask, tex_shift, tex_lod and tex_width) from the current
+  tLOD, textureMode, tformat and texBaseAddr* values in voodoo->params.
+
+  voodoo - emulator state whose params are read and updated
+  tmu    - index of the texture unit to recalculate*/
+void voodoo_recalc_tex(voodoo_t *voodoo, int tmu)
+{
+        const uint32_t tlod = voodoo->params.tLOD[tmu];
+        const uint32_t tex_mode = voodoo->params.textureMode[tmu];
+        const int is_16bit = (voodoo->params.tformat[tmu] & 8) != 0;
+        const int trilinear = (tex_mode & TEXTUREMODE_TRILINEAR) != 0;
+        const int odd_lods = (tlod & LOD_ODD) != 0;
+        const int aspect = (tlod >> 21) & 3;
+        int lod0_w = 256, lod0_h = 256, lod0_shift = 8;
+        uint32_t lod_offset[LOD_MAX+3];
+        int lod_w[LOD_MAX+3], lod_h[LOD_MAX+3], lod_shift[LOD_MAX+3];
+        uint32_t running = 0;
+        uint32_t base = voodoo->params.texBaseAddr[tmu];
+        int src_lod;
+        int lod;
+
+        /*The aspect field shrinks one dimension of the 256x256 LOD 0 image*/
+        if (tlod & LOD_S_IS_WIDER)
+                lod0_h >>= aspect;
+        else
+        {
+                lod0_w >>= aspect;
+                lod0_shift -= aspect;
+        }
+
+        /*Pass 1 : size and byte offset of every source LOD. Sizes are clamped
+          to at least 1 texel, but the offset advances by the unclamped size*/
+        for (lod = 0; lod <= LOD_MAX + 2; lod++)
+        {
+                const int w = lod0_w >> lod;
+                const int h = lod0_h >> lod;
+                const int sh = lod0_shift - lod;
+
+                lod_offset[lod] = running;
+                lod_w[lod] = w ? w : 1;
+                lod_h[lod] = h ? h : 1;
+                lod_shift[lod] = (sh < 0) ? 0 : sh;
+
+                /*With LOD_SPLIT only LODs whose parity matches LOD_ODD take up space*/
+                if (!(tlod & LOD_SPLIT) || ((lod & 1) != 0) == odd_lods)
+                        running += is_16bit ? (w * h * 2) : (w * h);
+        }
+
+        /*Trilinear with LOD_ODD starts at source LOD 1 (skip LOD 0)*/
+        src_lod = (trilinear && odd_lods) ? 1 : 0;
+
+        /*Pass 2 : map every LOD slot onto a source LOD and publish its tables*/
+        for (lod = 0; lod <= LOD_MAX+1; lod++)
+        {
+                uint32_t start;
+                int bytes;
+
+                if (tlod & LOD_TMULTIBASEADDR)
+                {
+                        /*Source LODs 0, 1 and 2 have their own base register,
+                          all remaining ones share texBaseAddr38*/
+                        if (src_lod == 0)
+                                base = voodoo->params.texBaseAddr[tmu];
+                        else if (src_lod == 1)
+                                base = voodoo->params.texBaseAddr1[tmu];
+                        else if (src_lod == 2)
+                                base = voodoo->params.texBaseAddr2[tmu];
+                        else
+                                base = voodoo->params.texBaseAddr38[tmu];
+                }
+
+                start = base + lod_offset[src_lod];
+                bytes = lod_w[src_lod] * lod_h[src_lod];
+                if (is_16bit)
+                        bytes *= 2;
+
+                voodoo->params.tex_base[tmu][lod] = start;
+                voodoo->params.tex_end[tmu][lod] = start + bytes;
+                voodoo->params.tex_w_mask[tmu][lod] = lod_w[src_lod] - 1;
+                voodoo->params.tex_w_nmask[tmu][lod] = ~(lod_w[src_lod] - 1);
+                voodoo->params.tex_h_mask[tmu][lod] = lod_h[src_lod] - 1;
+                voodoo->params.tex_shift[tmu][lod] = lod_shift[src_lod];
+                voodoo->params.tex_lod[tmu][lod] = src_lod;
+
+                /*Advance the source LOD : never on slot 0 when LOD_ODD is set,
+                  by one per slot without trilinear, and by two on slots of
+                  matching parity with trilinear*/
+                if (odd_lods && lod == 0)
+                        continue;
+                if (!trilinear)
+                        src_lod++;
+                else if (((lod & 1) != 0) == odd_lods)
+                        src_lod += 2;
+        }
+
+        voodoo->params.tex_width[tmu] = lod0_w;
+}
+
+/*Pack four 8-bit channels into one 32-bit word with alpha in bits 31-24, red in
+  bits 23-16, green in bits 15-8 and blue in bits 7-0. Every channel is widened
+  to uint32_t before shifting so that an alpha of 0x80 or above is not shifted
+  into the sign bit of a signed int.*/
+#define makergba(r, g, b, a)  ((uint32_t)(b) | ((uint32_t)(g) << 8) | ((uint32_t)(r) << 16) | ((uint32_t)(a) << 24))
+
+/*Attach a decoded texture to params for the given TMU.
+
+  The texture cache of the TMU is searched for an entry with the same base
+  address, the same tLOD bits (masked with 0xf00fff) and the same palette
+  checksum. On a hit the entry index is stored in params->tex_entry[tmu] and the
+  entry's refcount is incremented. On a miss an entry whose refcount equals the
+  refcount_r value(s) of the render thread(s) is taken over, every LOD between
+  the tLOD minimum and maximum (both clamped to 8) is converted from tex_mem to
+  32-bit words with makergba(), the source address ranges are recorded in
+  addr_start/addr_end and flagged in texture_present, and the entry is then
+  referenced exactly as for a hit.
+
+  voodoo - emulator state (texture memory, palettes, NCC tables, cache)
+  params - parameter block that receives tex_entry[tmu]
+  tmu    - texture unit index*/
+void voodoo_use_texture(voodoo_t *voodoo, voodoo_params_t *params, int tmu)
+{
+        int c, d;
+        int lod;
+        int lod_min, lod_max;
+        uint32_t addr = 0, addr_end;
+        uint32_t palette_checksum;
+
+        /*Palettised formats are additionally keyed on an XOR checksum of the
+          palette, recomputed only after the palette has been marked dirty*/
+        if (params->tformat[tmu] == TEX_PAL8 || params->tformat[tmu] == TEX_APAL8 || params->tformat[tmu] == TEX_APAL88)
+        {
+                if (voodoo->palette_dirty[tmu])
+                {
+                        palette_checksum = 0;
+
+                        for (c = 0; c < 256; c++)
+                                palette_checksum ^= voodoo->palette[tmu][c].u;
+
+                        voodoo->palette_checksum[tmu] = palette_checksum;
+                        voodoo->palette_dirty[tmu] = 0;
+                }
+                else
+                        palette_checksum = voodoo->palette_checksum[tmu];
+        }
+        else
+                palette_checksum = 0;
+
+        /*Base address that identifies the texture in the cache*/
+        if ((voodoo->params.tLOD[tmu] & LOD_SPLIT) && (voodoo->params.tLOD[tmu] & LOD_ODD) && (voodoo->params.tLOD[tmu] & LOD_TMULTIBASEADDR))
+                addr = params->texBaseAddr1[tmu];
+        else
+                addr = params->texBaseAddr[tmu];
+
+        /*Try to find texture in cache*/
+        for (c = 0; c < TEX_CACHE_MAX; c++)
+        {
+                if (voodoo->texture_cache[tmu][c].base == addr &&
+                    voodoo->texture_cache[tmu][c].tLOD == (params->tLOD[tmu] & 0xf00fff) &&
+                    voodoo->texture_cache[tmu][c].palette_checksum == palette_checksum)
+                {
+                        params->tex_entry[tmu] = c;
+                        voodoo->texture_cache[tmu][c].refcount++;
+                        return;
+                }
+        }
+
+        /*Texture not found, search for unused texture. The scan continues
+          round-robin from the last replaced slot; when every slot is still
+          referenced, wait for the render thread(s) and scan again. The loop can
+          only be left with a usable slot, so no "cache full" check is needed
+          afterwards*/
+        do
+        {
+                for (c = 0; c < TEX_CACHE_MAX; c++)
+                {
+                        voodoo->texture_last_removed++;
+                        voodoo->texture_last_removed &= (TEX_CACHE_MAX-1);
+                        if (voodoo->texture_cache[tmu][voodoo->texture_last_removed].refcount == voodoo->texture_cache[tmu][voodoo->texture_last_removed].refcount_r[0] &&
+                            (voodoo->render_threads == 1 || voodoo->texture_cache[tmu][voodoo->texture_last_removed].refcount == voodoo->texture_cache[tmu][voodoo->texture_last_removed].refcount_r[1]))
+                                break;
+                }
+                if (c == TEX_CACHE_MAX)
+                        voodoo_wait_for_render_thread_idle(voodoo);
+        } while (c == TEX_CACHE_MAX);
+
+        c = voodoo->texture_last_removed;
+
+        /*addr still holds the base address selected for the lookup above*/
+        voodoo->texture_cache[tmu][c].base = addr;
+        voodoo->texture_cache[tmu][c].tLOD = params->tLOD[tmu] & 0xf00fff;
+
+        /*LOD range to decode, clamped to the levels the cache entry stores*/
+        lod_min = (params->tLOD[tmu] >> 2) & 15;
+        lod_max = (params->tLOD[tmu] >> 8) & 15;
+        lod_min = MIN(lod_min, 8);
+        lod_max = MIN(lod_max, 8);
+        for (lod = lod_min; lod <= lod_max; lod++)
+        {
+                /*Destination of this LOD inside the entry's decoded data*/
+                uint32_t *base = &voodoo->texture_cache[tmu][c].data[texture_offset[lod]];
+                uint32_t tex_addr = params->tex_base[tmu][lod] & voodoo->texture_mask;
+                int x, y;
+                /*Destination rows are 1 << shift texels apart*/
+                int shift = 8 - params->tex_lod[tmu][lod];
+                rgba_u *pal;
+
+                /*Every case walks tex_h_mask+1 rows of tex_w_mask+1 texels.
+                  8-bit formats step the source by 1 << tex_shift bytes per row,
+                  16-bit formats by twice that*/
+                switch (params->tformat[tmu])
+                {
+                        case TEX_RGB332:
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        base[x] = makergba(rgb332[dat].r, rgb332[dat].g, rgb332[dat].b, 0xff);
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_Y4I2Q2:
+                        /*Colours come from the NCC lookup table selected by TEXTUREMODE_NCC_SEL*/
+                        pal = voodoo->ncc_lookup[tmu][(voodoo->params.textureMode[tmu] & TEXTUREMODE_NCC_SEL) ? 1 : 0];
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        base[x] = makergba(pal[dat].rgba.r, pal[dat].rgba.g, pal[dat].rgba.b, 0xff);
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_A8:
+                        /*The single byte is used for all four channels*/
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        base[x] = makergba(dat, dat, dat, dat);
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_I8:
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        base[x] = makergba(dat, dat, dat, 0xff);
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_AI8:
+                        /*Low nibble is replicated into R, G and B, high nibble into alpha*/
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        base[x] = makergba((dat & 0x0f) | ((dat << 4) & 0xf0), (dat & 0x0f) | ((dat << 4) & 0xf0), (dat & 0x0f) | ((dat << 4) & 0xf0), (dat & 0xf0) | ((dat >> 4) & 0x0f));
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_PAL8:
+                        pal = voodoo->palette[tmu];
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        base[x] = makergba(pal[dat].rgba.r, pal[dat].rgba.g, pal[dat].rgba.b, 0xff);
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_APAL8:
+                        /*The four output channels are reassembled from bit fields
+                          spread over the palette entry's r, g and b bytes*/
+                        pal = voodoo->palette[tmu];
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint8_t dat = voodoo->tex_mem[tmu][(tex_addr+x) & voodoo->texture_mask];
+
+                                        int r = ((pal[dat].rgba.r & 3) << 6) | ((pal[dat].rgba.g & 0xf0) >> 2) | (pal[dat].rgba.r & 3);
+                                        int g = ((pal[dat].rgba.g & 0xf) << 4) | ((pal[dat].rgba.b & 0xc0) >> 4) | ((pal[dat].rgba.g & 0xf) >> 2);
+                                        int b = ((pal[dat].rgba.b & 0x3f) << 2) | ((pal[dat].rgba.b & 0x30) >> 4);
+                                        int a = (pal[dat].rgba.r & 0xfc) | ((pal[dat].rgba.r & 0xc0) >> 6);
+
+                                        base[x] = makergba(r, g, b, a);
+                                }
+                                tex_addr += (1 << voodoo->params.tex_shift[tmu][lod]);
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_ARGB8332:
+                        /*16-bit texel : low byte is RGB332, high byte is alpha*/
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(rgb332[dat & 0xff].r, rgb332[dat & 0xff].g, rgb332[dat & 0xff].b, dat >> 8);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_A8Y4I2Q2:
+                        /*16-bit texel : low byte indexes the NCC table, high byte is alpha*/
+                        pal = voodoo->ncc_lookup[tmu][(voodoo->params.textureMode[tmu] & TEXTUREMODE_NCC_SEL) ? 1 : 0];
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(pal[dat & 0xff].rgba.r, pal[dat & 0xff].rgba.g, pal[dat & 0xff].rgba.b, dat >> 8);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_R5G6B5:
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(rgb565[dat].r, rgb565[dat].g, rgb565[dat].b, 0xff);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_ARGB1555:
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(argb1555[dat].r, argb1555[dat].g, argb1555[dat].b, argb1555[dat].a);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_ARGB4444:
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(argb4444[dat].r, argb4444[dat].g, argb4444[dat].b, argb4444[dat].a);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_A8I8:
+                        /*16-bit texel : low byte is intensity, high byte is alpha*/
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(dat & 0xff, dat & 0xff, dat & 0xff, dat >> 8);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        case TEX_APAL88:
+                        /*16-bit texel : low byte is a palette index, high byte is alpha*/
+                        pal = voodoo->palette[tmu];
+                        for (y = 0; y < voodoo->params.tex_h_mask[tmu][lod]+1; y++)
+                        {
+                                for (x = 0; x < voodoo->params.tex_w_mask[tmu][lod]+1; x++)
+                                {
+                                        uint16_t dat = *(uint16_t *)&voodoo->tex_mem[tmu][(tex_addr + x*2) & voodoo->texture_mask];
+
+                                        base[x] = makergba(pal[dat & 0xff].rgba.r, pal[dat & 0xff].rgba.g, pal[dat & 0xff].rgba.b, dat >> 8);
+                                }
+                                tex_addr += (1 << (voodoo->params.tex_shift[tmu][lod]+1));
+                                base += (1 << shift);
+                        }
+                        break;
+
+                        default:
+                        fatal("Unknown texture format %i\n", params->tformat[tmu]);
+                }
+        }
+
+        voodoo->texture_cache[tmu][c].is16 = voodoo->params.tformat[tmu] & 8;
+
+        if (params->tformat[tmu] == TEX_PAL8 || params->tformat[tmu] == TEX_APAL8 || params->tformat[tmu] == TEX_APAL88)
+                voodoo->texture_cache[tmu][c].palette_checksum = palette_checksum;
+        else
+                voodoo->texture_cache[tmu][c].palette_checksum = 0;
+
+        /*Record the source address ranges backing this entry : one range each
+          for LODs 0, 1 and 2, and a fourth spanning LOD 3 and above. An end
+          address of 0 marks an unused range*/
+        if (lod_min == 0)
+        {
+                voodoo->texture_cache[tmu][c].addr_start[0] = voodoo->params.tex_base[tmu][0];
+                voodoo->texture_cache[tmu][c].addr_end[0] = voodoo->params.tex_end[tmu][0];
+        }
+        else
+                voodoo->texture_cache[tmu][c].addr_start[0] = voodoo->texture_cache[tmu][c].addr_end[0] = 0;
+
+        if (lod_min <= 1 && lod_max >= 1)
+        {
+                voodoo->texture_cache[tmu][c].addr_start[1] = voodoo->params.tex_base[tmu][1];
+                voodoo->texture_cache[tmu][c].addr_end[1] = voodoo->params.tex_end[tmu][1];
+        }
+        else
+                voodoo->texture_cache[tmu][c].addr_start[1] = voodoo->texture_cache[tmu][c].addr_end[1] = 0;
+
+        if (lod_min <= 2 && lod_max >= 2)
+        {
+                voodoo->texture_cache[tmu][c].addr_start[2] = voodoo->params.tex_base[tmu][2];
+                voodoo->texture_cache[tmu][c].addr_end[2] = voodoo->params.tex_end[tmu][2];
+        }
+        else
+                voodoo->texture_cache[tmu][c].addr_start[2] = voodoo->texture_cache[tmu][c].addr_end[2] = 0;
+
+        if (lod_max >= 3)
+        {
+                voodoo->texture_cache[tmu][c].addr_start[3] = voodoo->params.tex_base[tmu][(lod_min > 3) ? lod_min : 3];
+                voodoo->texture_cache[tmu][c].addr_end[3] = voodoo->params.tex_end[tmu][(lod_max < 8) ? lod_max : 8];
+        }
+        else
+                voodoo->texture_cache[tmu][c].addr_start[3] = voodoo->texture_cache[tmu][c].addr_end[3] = 0;
+
+
+        /*Flag every dirty-tracking block touched by those ranges, so that
+          voodoo_tex_writel() knows a write there must flush the cache*/
+        for (d = 0; d < 4; d++)
+        {
+                addr = voodoo->texture_cache[tmu][c].addr_start[d];
+                addr_end = voodoo->texture_cache[tmu][c].addr_end[d];
+
+                if (addr_end != 0)
+                {
+                        for (; addr <= addr_end; addr += (1 << TEX_DIRTY_SHIFT))
+                                voodoo->texture_present[tmu][(addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT] = 1;
+                }
+        }
+
+        params->tex_entry[tmu] = c;
+        voodoo->texture_cache[tmu][c].refcount++;
+}
+
+/*Drop every cached texture of one TMU whose source data covers dirty_addr.
+
+  The texture_present map of the TMU is cleared and rebuilt while walking the
+  cache : an address range containing dirty_addr invalidates its entry
+  (base = -1), any other range is flagged as present again. If an invalidated
+  entry still has a refcount that differs from the render threads' refcount_r,
+  the function waits for the render thread(s) to go idle before returning.
+
+  voodoo     - emulator state
+  dirty_addr - texture memory address that is being written (already masked by
+               the caller in voodoo_tex_writel)
+  tmu        - texture unit whose cache is flushed*/
+void flush_texture_cache(voodoo_t *voodoo, uint32_t dirty_addr, int tmu)
+{
+        int must_wait = 0;
+        int entry;
+
+        memset(voodoo->texture_present[tmu], 0, sizeof(voodoo->texture_present[0]));
+
+        for (entry = 0; entry < TEX_CACHE_MAX; entry++)
+        {
+                int range;
+
+                /*Entry holds no texture*/
+                if (voodoo->texture_cache[tmu][entry].base == -1)
+                        continue;
+
+                for (range = 0; range < 4; range++)
+                {
+                        int first = voodoo->texture_cache[tmu][entry].addr_start[range];
+                        int last = voodoo->texture_cache[tmu][entry].addr_end[range];
+                        int first_masked, last_masked;
+
+                        /*An end address of 0 marks an unused range*/
+                        if (last == 0)
+                                continue;
+
+                        /*Widen the range to 1 KB boundaries inside texture memory*/
+                        first_masked = first & voodoo->texture_mask & ~0x3ff;
+                        last_masked = ((last & voodoo->texture_mask) + 0x3ff) & ~0x3ff;
+
+                        /*Range wrapped around the end of texture memory*/
+                        if (last_masked < first_masked)
+                                last_masked = voodoo->texture_mask+1;
+
+                        if (dirty_addr < first_masked || dirty_addr >= last_masked)
+                        {
+                                /*Not affected by the write, keep it flagged as present*/
+                                for (; first <= last; first += (1 << TEX_DIRTY_SHIFT))
+                                        voodoo->texture_present[tmu][(first & voodoo->texture_mask) >> TEX_DIRTY_SHIFT] = 1;
+                                continue;
+                        }
+
+                        /*Write hits this texture : note whether it is still
+                          referenced, then invalidate the entry*/
+                        if (voodoo->texture_cache[tmu][entry].refcount != voodoo->texture_cache[tmu][entry].refcount_r[0] ||
+                            (voodoo->render_threads == 2 && voodoo->texture_cache[tmu][entry].refcount != voodoo->texture_cache[tmu][entry].refcount_r[1]))
+                                must_wait = 1;
+
+                        voodoo->texture_cache[tmu][entry].base = -1;
+                }
+        }
+
+        if (must_wait)
+                voodoo_wait_for_render_thread_idle(voodoo);
+}
+
+/*Handle a 32-bit write into the texture memory aperture.
+
+  addr - offset of the write; bit 22 set is ignored (TREX != 0), bit 21 selects
+         TMU 1, which is ignored unless dual_tmus is set
+  val  - 32-bit value to store
+  p    - the voodoo_t instance
+
+  On types below VOODOO_BANSHEE the address encodes a LOD (bits 20-17), a row t
+  (bits 16-9) and a column s, which are translated into a texture memory address
+  through the tex_base/tex_shift tables; writes to a LOD above LOD_MAX are
+  dropped. On Banshee and later the low 21 bits are a linear offset from
+  tex_base[tmu][0]. If the target block is flagged in texture_present the
+  texture cache is flushed before the value is stored (on VOODOO_3 the other
+  TMU's cache is checked as well).*/
+void voodoo_tex_writel(uint32_t addr, uint32_t val, void *p)
+{
+        int lod, s, t;
+        voodoo_t *voodoo = (voodoo_t *)p;
+        int tmu;
+
+        if (addr & 0x400000)
+                return; /*TREX != 0*/
+
+        tmu = (addr & 0x200000) ? 1 : 0;
+
+        if (tmu && !voodoo->dual_tmus)
+                return;
+
+        if (voodoo->type < VOODOO_BANSHEE)
+        {
+                /*This branch only runs for pre-Banshee types, so the
+                  "type >= VOODOO_BANSHEE" alternatives that used to be tested
+                  here could never be taken and have been removed*/
+                lod = (addr >> 17) & 0xf;
+                t = (addr >> 9) & 0xff;
+
+                if (voodoo->params.tformat[tmu] & 8)
+                        s = (addr >> 1) & 0xfe;
+                else if (voodoo->params.textureMode[tmu] & (1u << 31)) /*1u : shifting a signed 1 by 31 overflows int*/
+                        s = addr & 0xfc;
+                else
+                        s = (addr >> 1) & 0xfc;
+
+                if (lod > LOD_MAX)
+                        return;
+
+                /*16-bit formats use two bytes per texel*/
+                if (voodoo->params.tformat[tmu] & 8)
+                        addr = voodoo->params.tex_base[tmu][lod] + s*2 + (t << voodoo->params.tex_shift[tmu][lod])*2;
+                else
+                        addr = voodoo->params.tex_base[tmu][lod] + s + (t << voodoo->params.tex_shift[tmu][lod]);
+        }
+        else
+                addr = (addr & 0x1ffffc) + voodoo->params.tex_base[tmu][0];
+
+        /*A cached texture was decoded from this block, so it must be evicted*/
+        if (voodoo->texture_present[tmu][(addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT])
+                flush_texture_cache(voodoo, addr & voodoo->texture_mask, tmu);
+        if (voodoo->type == VOODOO_3 && voodoo->texture_present[tmu^1][(addr & voodoo->texture_mask) >> TEX_DIRTY_SHIFT])
+                flush_texture_cache(voodoo, addr & voodoo->texture_mask, tmu^1);
+
+        *(uint32_t *)(&voodoo->tex_mem[tmu][addr & voodoo->texture_mask]) = val;
+}
diff --git a/pcem/vid_voodoo_texture.h b/pcem/vid_voodoo_texture.h
new file mode 100644 (file)
index 0000000..e4c5602
--- /dev/null
@@ -0,0 +1,19 @@
+/*Start index of each LOD inside a texture cache entry's decoded data array
+  (used by voodoo_use_texture() as data[texture_offset[lod]]). Entry n is the
+  running total of the square level sizes before it : 256*256 for LOD 0,
+  128*128 for LOD 1 and so on down to 1*1, followed by one extra element.*/
+static const uint32_t texture_offset[LOD_MAX+3] =
+{
+        0,
+        256*256,
+        256*256 + 128*128,
+        256*256 + 128*128 + 64*64,
+        256*256 + 128*128 + 64*64 + 32*32,
+        256*256 + 128*128 + 64*64 + 32*32 + 16*16,
+        256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8,
+        256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4,
+        256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2,
+        256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2 + 1*1,
+        256*256 + 128*128 + 64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2 + 1*1 + 1
+};
+
+/*Recompute the per-LOD address, mask and shift tables of one TMU from its texture registers.*/
+void voodoo_recalc_tex(voodoo_t *voodoo, int tmu);
+/*Find or decode the texture cache entry for params on the given TMU, store its
+  index in params->tex_entry[tmu] and take a reference on it.*/
+void voodoo_use_texture(voodoo_t *voodoo, voodoo_params_t *params, int tmu);
+/*32-bit write into texture memory; p is the voodoo_t instance. Flushes the
+  texture cache first when the written block backs a cached texture.*/
+void voodoo_tex_writel(uint32_t addr, uint32_t val, void *p);
+/*Invalidate the cached textures of one TMU whose source data covers dirty_addr.*/
+void flush_texture_cache(voodoo_t *voodoo, uint32_t dirty_addr, int tmu);