--- /dev/null
+
+ .chip 68020
+ .globl _inflate
+
+
+/*
+ * inflate.S
+ *
+ * Decompression of DEFLATE streams, as produced by zip/gzip/pkzip and
+ * specified in RFC 1951 "DEFLATE Compressed Data Format Specification".
+ *
+ * Usage: Optionally configure the OPT_xxx options below at build time;
+ * at run time 'bsr _inflate' with arguments:
+ * a4 = output buffer, a5 = input stream
+ * a6 = *end* of temporary storage area (only if OPT_STORAGE_OFFSTACK)
+ * All register values (including arguments) are preserved.
+ *
+ * Space requirements: 638-930 bytes code; 2044-2940 bytes stack.
+ * (NB1. Above ranges are [No Optimisations]-[All Optimisations])
+ * (NB2. Stack space can be relocated to a separately-specified storage
+ * area, see OPT_STORAGE_OFFSTACK below)
+ *
+ * Timings: With all Optimisation Options enabled (see below) this routine
+ * will decompress on a basic 7MHz 68000 at ~25kB/s. An AmigaDOS track of
+ * data (5.5kB) is processed in ~220ms. This is only fractionally slower than
+ * the track can be fetched from disk, hence there is scope for a
+ * decompressing loader to keep CPU and disk both at near 100% utilisation.
+ *
+ * Written & released by Keir Fraser <keir.xen@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ * See the file COPYING for more details, or visit <http://unlicense.org>.
+ */
+
+/* Optimisation Option #1:
+ * Avoid long Huffman-tree walks by indexing the first 8 bits of each codeword
+ * in a 256-entry lookup table. This shortens all walks by 8 steps and since
+ * the most common codes are less than 8 bits, most tree walks are avoided.
+ * Also pre-shifts selected symbols in the code->symbol table, ready to be used
+ * as indexes into further lookup tables.
+ * SPEEDUP: 41% (c.w. no Options); COST: 122 bytes code, 896 bytes stack */
+#ifndef OPT_TABLE_LOOKUP
+#define OPT_TABLE_LOOKUP 1
+#endif
+
+/* Optimisation Option #2:
+ * Inline functions in the main decode loop to avoid all BSR/RTS pairs.
+ * SPEEDUP: 15% (on top of Option #1); COST: 164 bytes code */
+#ifndef OPT_INLINE_FUNCTIONS
+#define OPT_INLINE_FUNCTIONS 1
+#endif
+
+/* Optimisation Option #3:
+ * Unroll the copy loop for <distance,length> tuples by one iteration
+ * (so two bytes are copied per iteration).
+ * SPEEDUP: ~1% (on top of Options #1 and #2); COST: 6 bytes code */
+#ifndef OPT_UNROLL_COPY_LOOP
+#define OPT_UNROLL_COPY_LOOP 1
+#endif
+
+/* Storage Option:
+ * All but 12 bytes of this routine's space requirement can be allocated
+ * off stack, in a data area specified in register a6.
+ * If this option is set then inflate must be called with a6 pointing at
+ * the *end* of the reserved storage area (+2032 or +2928 bytes, depending
+ * on whether OPT_TABLE_LOOKUP is enabled).
+ * SPEEDUP: none; COST: -2 bytes code (makes code slightly smaller) */
+#ifndef OPT_STORAGE_OFFSTACK
+#define OPT_STORAGE_OFFSTACK 0
+#endif
+
+/* By default all lookup/conversion tables are generated on-the-fly on every
+ * call to inflate. In some cases this can be very inefficient.
+ * If this option is enabled then two new routines are generated: At start-of-
+ * day call 'inflate_gentables' with a6 pointing to the *end* of a 6000-byte
+ * block of memory. Then call 'inflate_fromtables' instead of 'inflate', with
+ * a6 still pointing to the end of the pre-generated memory block.
+ * SPEEDUP: variable; COST: 116 bytes code */
+#ifndef OPT_PREGENERATE_TABLES
+#define OPT_PREGENERATE_TABLES 0
+#endif
+
+/* By default all registers are saved/restored across 'inflate' and
+ * 'inflate_fromtables'. This set can be reduced below. Note that if
+ * a4 is not saved then it will point at the end of the uncompressed output.
+ * If a5 is not saved then it will point at the end of the DEFLATE stream. */
+#ifndef SAVE_RESTORE_REGS
+#define SAVE_RESTORE_REGS d0-d6/a0-a3
+#endif
+
+#if OPT_STORAGE_OFFSTACK
+#define aS a6
+#else
+#define aS sp
+#endif
+
+/* Longest possible code. */
+#define MAX_CODE_LEN 16
+
+/* (Maximum) alphabet sizes. */
+#define nr_codelen_symbols 19
+#define nr_litlen_symbols 288
+#define nr_distance_symbols 32
+
+/* Alphabet-description stream for a static Huffman block (BTYPE=01b).
+ * static_huffman points the input stream (a5) at these bytes and then joins
+ * the dynamic_huffman path, which parses them as if they were the header of
+ * a dynamic block; static blocks therefore need no table builder of their
+ * own. */
+static_huffman_prefix:
+ dc.b 0xff, 0x5b, 0x00, 0x6c, 0x03, 0x36, 0xdb
+ dc.b 0xb6, 0x6d, 0xdb, 0xb6, 0x6d, 0xdb, 0xb6
+ dc.b 0xcd, 0xdb, 0xb6, 0x6d, 0xdb, 0xb6, 0x6d
+ dc.b 0xdb, 0xa8, 0x6d, 0xce, 0x8b, 0x6d, 0x3b
+
+#if OPT_TABLE_LOOKUP
+
+/* Number of bytes required for code-lookup table/tree:
+ * - 256 2-byte entries for the 8-bit lookup table
+ * - Worst-case only 8 symbols decode directly in the table and all the rest
+ * are in a tree hanging off one table entry. This tree requires
+ * (nr_symbols-8)-1 internal 4-byte nodes. */
+#define LOOKUP_BYTES(nr_syms) (256*2+((nr_syms)-9)*4)
+
+ /* build_code: construct the decode structure for one Huffman code.
+ * Layout at a1 (which the caller must have zero-filled):
+ * - 256 2-byte table entries, indexed by the first 8 code bits in the
+ * bit-reversed order in which the stream delivers them;
+ * - 4-byte tree nodes (two 2-byte links), numbered from 128 upwards, for
+ * the remaining bits of codes longer than 8 bits.
+ * A table leaf holds (symbol<<3)|(codelen-1); a tree leaf holds the symbol;
+ * an internal link holds the child node number with bit 15 set.
+ * d0-d7 are saved and restored. */
+ /* a0 = len[], a1 = nodes[], d0 = nr_symbols */
+ /* d1 = symbol beyond which all symbols get <<2 */
+ /* a2-a3 are scratched */
+build_code:
+ movem.l d0-d7,-(aS)
+
+ /* Allocate space for bl_count[]/next_code[] array on stack. */
+ /* (dbf runs count+1 times: ((MAX_CODE_LEN+1)/2)+1 zeroed longwords) */
+ moveq #(MAX_CODE_LEN+1)/2,d1
+ moveq #0,d2
+1: move.l d2,-(aS)
+ dbf d1,1b
+
+ /* Count occurrences of each code length into bl_count[] array. */
+ subq.w #1,d0
+ move.w d0,d1
+ move.l a0,a2 /* a2 = &len[0] */
+1: move.b (a2)+,d2 /* d2 = len[i] */
+#if MC68020
+ addq.w #1,(aS,d2.w*2)
+#else
+ add.b d2,d2
+ addq.w #1,(aS,d2.w) /* bl_count[len[i]]++ */
+#endif
+ dbf d1,1b
+
+ /* Calculate next_code[] start values for each code length. */
+ move.l aS,a2 /* a2 = bl_count[] / next_code[] */
+ moveq #MAX_CODE_LEN-1,d1
+ moveq #0,d2 /* d2 = code */
+ move.w d2,(aS) /* bl_count[0] = 0, ignore zero-length codes */
+1: add.w (a2),d2
+ add.w d2,d2 /* code = (code + bl_count[i-1]) << 1 */
+ move.w d2,(a2)+ /* next_code[i] = code */
+ dbf d1,1b
+
+ /* Create the Huffman-code lookup tree */
+ move.w d0,d1
+ moveq #127,d4 /* d4 = next_node = 127 */
+ move.l a0,a2 /* a2 = &len[0] */
+1: moveq #0,d5
+ move.b (a2)+,d5 /* d5 = len[i] / *len++ */
+ jeq 4f /* zero length: symbol is not used */
+ subq.w #1,d5
+ move.w d5,d6
+#if MC68020
+ move.w (aS,d6.w*2),d3
+ addq.w #1,(aS,d6.w*2)
+#else
+ add.w d6,d6
+ move.w (aS,d6.w),d3 /* d3 = code = next_code[len[i]]++ */
+ addq.w #1,(aS,d6.w)
+ move.w d5,d6
+#endif
+
+ /* Reverse the codelen bits of the code so that its first bit ends up
+ * in bit 0, matching the LSB-first bit cache of the stream. */
+ moveq #0,d2
+9: lsr.w #1,d3
+ roxl.w #1,d2
+ dbf d6,9b /* d5 = codelen-1; d2 = reversed code */
+ move.b d2,d3
+ add.w d3,d3 /* d3 = table offset */
+ move.w d0,d6
+ sub.w d1,d6 /* d6 = symbol */
+ cmp.w (((MAX_CODE_LEN+1)/2)+1)*4+6(aS),d6 /* symbol > saved d1.w? */
+ jls 9f
+ lsl.w #2,d6 /* symbol <<= 2 if so */
+9: cmp.b #9-1,d5
+ jcc codelen_gt_8
+
+codelen_le_8: /* codelen <= 8: leaf in table entry(s) */
+ /* The leaf is replicated into every table slot whose low codelen
+ * index bits equal the code, walking down from the highest slot. */
+ lsl.w #3,d6
+ or.b d5,d6 /* d6 = (symbol<<3) | (codelen-1) */
+ moveq #0,d2
+ addq.b #2,d5
+ bset d5,d2 /* d2 = 1<<(codelen+1) [table step] */
+ move.w d2,d7
+ neg.w d7
+ and.w #511,d7
+ or.w d7,d3 /* d3 = last table offset */
+9: move.w d6,(a1,d3.w)
+ sub.w d2,d3
+ jcc 9b
+ jra 4f
+
+codelen_gt_8: /* codelen > 8: requires a tree walk */
+ lsr.w #8,d2 /* d2 = code bits beyond the first 8 */
+ subq.b #8,d5 /* Skip the first 8 bits of code */
+ lea (a1,d3.w),a3 /* pnode = table entry */
+
+2: /* Walk through *pnode. */
+ move.w (a3),d7 /* d7 = *pnode */
+ jne 3f
+ /* Link missing: Create a new internal node */
+ addq.w #1,d4
+ move.w d4,d7
+ bset #15,d7
+ move.w d7,(a3) /* *pnode = ++next_node | INTERNAL */
+3: /* Take left or right branch depending on next code bit */
+ /* X = next code bit; d7 = 2*node + bit (INTERNAL flag shifts out) */
+ lsr.b #1,d2
+ addx.w d7,d7
+#if MC68020
+ lea (a1,d7.w*2),a3
+#else
+ add.w d7,d7
+ lea (a1,d7.w),a3 /* pnode = next_bit ? &node->r : &node->l */
+#endif
+3: dbf d5,2b
+
+ /* Insert the current symbol as a new leaf node */
+ move.w d6,(a3) /* *pnode = sym */
+4: dbf d1,1b
+
+ /* Drop the bl_count[]/next_code[] array and restore registers. */
+ lea (((MAX_CODE_LEN+1)/2)+1)*4(aS),aS
+ movem.l (aS)+,d0-d7
+ rts
+
+ /* STREAM_NEXT_SYMBOL: decode one Huffman symbol using the table/tree that
+ * build_code left at a0. The bit cache is topped up to at least 8 bits
+ * and its low byte indexes the 256-entry table. A non-negative entry is
+ * a leaf: its low 3 bits give codelen-1 and the rest the symbol. A
+ * negative entry is an internal link: 8 bits are dropped and the tree
+ * is walked one bit at a time until a link without bit 15 is found. */
+ /* d5-d6/a5 = stream, a0 = tree */
+ /* d0.w = result, d1.l = scratch */
+.macro STREAM_NEXT_SYMBOL
+ moveq #0,d0 /* 4 */
+ moveq #7,d1 /* 4 */
+ cmp.b d1,d6 /* 4 */
+ jhi 99f /* 10 */
+ /* Less than 8 bits cached; grab another byte from the stream */
+ move.b (a5)+,d0 /* [8] */
+ lsl.w d6,d0 /* [~14] */
+ or.w d0,d5 /* [4] */ /* s->cur |= *p++ << s->nr */
+ addq.b #8,d6 /* [4] */ /* s->nr += 8 */
+ moveq #0,d0 /* [4] */
+99: /* Use next input byte as index into code lookup table */
+ move.b d5,d0 /* 4 */
+#if MC68020
+ move.w (a0,d0.w*2),d0
+#else
+ add.w d0,d0 /* 4 */
+ move.w (a0,d0.w),d0 /* 14 */
+#endif
+ jpl 99f /* 10 (taken) */
+ /* Code is longer than 8 bits: do the remainder via a tree walk */
+ lsr.w #8,d5
+ subq.b #8,d6 /* consume 8 bits from the stream */
+98: /* stream_next_bits(1), inlined & optimised */
+ subq.b #1,d6 /* 4 cy */
+ jcc 97f /* 10 cy (taken) */
+ /* Cache was empty: reload a whole byte, 7 bits remain after this one */
+ move.b (a5)+,d5 /* [8 cy] */
+ moveq #7,d6 /* [4 cy] */
+97: lsr.w #1,d5 /* 8 cy */
+ addx.w d0,d0 /* 4 cy */
+#if MC68020
+ move.w (a0,d0.w*2),d0
+#else
+ add.w d0,d0 /* 4 cy */
+ move.w (a0,d0.w),d0 /* 14 cy */
+#endif
+ jmi 98b /* 10 cy (taken); loop on INTERNAL flag */
+ jra 98f /* TOTAL LOOP CYCLES ~= 54 */
+99: /* Symbol found directly: consume bits and return symbol */
+ and.b d0,d1 /* 4 */
+ addq.b #1,d1 /* 4 */
+ lsr.w d1,d5 /* ~16 */ /* consume bits from the stream */
+ sub.b d1,d6 /* 4 */
+ lsr.w #3,d0 /* 12 */ /* d0 = symbol */
+98: /* ~94 CYCLES TOTAL [+ 34] */
+.endm
+
+#else /* !OPT_TABLE_LOOKUP */
+
+/* Number of bytes required for code-lookup tree:
+ * - Every binary tree with N leaves has N-1 internal nodes.
+ * - Internal nodes require 4 bytes each. Leaves are free. */
+#define LOOKUP_BYTES(nr_syms) (((nr_syms)-1)*4)
+
+ /* build_code: construct a binary decode tree for one Huffman code.
+ * nodes[] (a1) is an array of 4-byte nodes, each two 2-byte links
+ * (left at +0, right at +2); node 0 is the root. A link with bit 15 set
+ * holds a child node number, otherwise it holds a leaf symbol. A zero
+ * link is treated as not-yet-created while building, so the caller
+ * must supply zero-filled memory. d0-d5 are saved and restored. */
+ /* a0 = len[], a1 = nodes[], d0 = nr_symbols */
+ /* a2-a3 are scratched */
+build_code:
+ movem.l d0-d5,-(aS)
+
+ /* Allocate space for bl_count[]/next_code[] array on stack. */
+ /* (dbf runs count+1 times: ((MAX_CODE_LEN+1)/2)+1 zeroed longwords) */
+ moveq #(MAX_CODE_LEN+1)/2,d1
+ moveq #0,d2
+1: move.l d2,-(aS)
+ dbf d1,1b
+
+ /* Count occurrences of each code length into bl_count[] array. */
+ subq.w #1,d0
+ move.w d0,d1
+ move.l a0,a2 /* a2 = &len[0] */
+1: move.b (a2)+,d2 /* d2 = len[i] */
+ add.b d2,d2
+ addq.w #1,(aS,d2.w) /* bl_count[len[i]]++ */
+ dbf d1,1b
+
+ /* Calculate next_code[] start values for each code length. */
+ move.l aS,a2 /* a2 = bl_count[] / next_code[] */
+ moveq #MAX_CODE_LEN-1,d1
+ moveq #0,d2 /* d2 = code */
+ move.w d2,(aS) /* bl_count[0] = 0, ignore zero-length codes */
+1: add.w (a2),d2
+ add.w d2,d2 /* code = (code + bl_count[i-1]) << 1 */
+ move.w d2,(a2)+ /* next_code[i] = code */
+ dbf d1,1b
+
+ /* Create the Huffman-code lookup tree */
+ move.w d0,d1
+ moveq #0,d4 /* d4 = next_node */
+ move.l a0,a2 /* a2 = &len[0] */
+1: moveq #0,d5
+ move.b (a2)+,d5 /* d5 = len[i] / *len++ */
+ jeq 4f /* zero length: symbol is not used */
+ subq.w #1,d5
+ add.w d5,d5
+ move.w (aS,d5.w),d3 /* d3 = code = next_code[len[i]]++ */
+ addq.w #1,(aS,d5.w)
+ lsr.w #1,d5 /* d5 = codelen-1 = index of the first code bit */
+ /* Walk down the tree, creating nodes as necessary */
+ moveq #0,d2 /* d2 = 0 (root node) */
+ jra 3f
+
+2: /* Walk through *pnode. */
+ move.w (a3),d2 /* d2 = *pnode */
+ jne 3f
+ /* Link missing: Create a new internal node */
+ addq.w #1,d4
+ move.w d4,d2
+ bset #15,d2
+ move.w d2,(a3) /* *pnode = ++next_node | INTERNAL */
+3: /* Take left or right branch depending on next code bit */
+ lsl.w #2,d2 /* node number -> byte offset (INTERNAL flag shifts out) */
+ btst d5,d3
+ jeq 3f
+ addq.w #2,d2
+3: lea (a1,d2.w),a3 /* pnode = next_bit ? &node->r : &node->l */
+ dbf d5,2b
+
+ /* Insert the current symbol as a new leaf node */
+ move.w d0,d2
+ sub.w d1,d2 /* d2 = symbol = (nr_symbols-1) - remaining count */
+ move.w d2,(a3) /* *pnode = sym */
+4: dbf d1,1b
+
+ /* Drop the bl_count[]/next_code[] array and restore registers. */
+ lea (((MAX_CODE_LEN+1)/2)+1)*4(aS),aS
+ movem.l (aS)+,d0-d5
+ rts
+
+ /* STREAM_NEXT_SYMBOL: decode one Huffman symbol by walking the tree at a0
+ * from the root, one stream bit per step, until a link without the
+ * INTERNAL flag (bit 15) is loaded; that link value is the symbol. */
+ /* d5-d6/a5 = stream, a0 = tree */
+ /* d0.w = result */
+.macro STREAM_NEXT_SYMBOL
+ moveq #0,d0 /* start at node 0 (root) */
+99: /* stream_next_bits(1), inlined & optimised */
+ subq.b #1,d6 /* 4 cy */
+ jcc 98f /* 10 cy (taken) */
+ /* Cache was empty: reload a whole byte, 7 bits remain after this one */
+ move.b (a5)+,d5 /* [8 cy] */
+ moveq #7,d6 /* [4 cy] */
+98: lsr.w #1,d5 /* 8 cy */
+ addx.w d0,d0 /* 4 cy */
+ add.w d0,d0 /* 4 cy */
+ move.w (a0,d0.w),d0 /* 14 cy */
+ jmi 99b /* 10 cy (taken); loop on INTERNAL flag set */
+ /* TOTAL LOOP CYCLES ~= 54 */
+.endm
+
+#endif
+
+ /* STREAM_NEXT_BITS: extract the next d1 bits from the stream, LSB first.
+ * Bytes are appended above the bits already cached in d5 until at least
+ * d1 bits are available; the low d1 bits are returned in d0.w and
+ * shifted out of the cache. d1 is left unchanged. */
+ /* d1.b = nr, d5-d6/a5 = stream [fetched_bits/nr_fetched_bits/inp] */
+ /* d0.w = result */
+.macro STREAM_NEXT_BITS
+99: moveq #0,d0
+ cmp.b d1,d6
+ jcc 99f /* while (s->nr < nr) */
+ move.b (a5)+,d0
+ lsl.l d6,d0
+ or.l d0,d5 /* s->cur |= *p++ << s->nr */
+ addq.b #8,d6 /* s->nr += 8 */
+ jra 99b
+99: bset d1,d0
+ subq.w #1,d0 /* d0 = (1<<nr)-1 */
+ and.w d5,d0 /* d0 = s->cur & ((1<<nr)-1) */
+ lsr.l d1,d5 /* s->cur >>= nr */
+ sub.b d1,d6 /* s->nr -= nr */
+.endm
+
+#if OPT_INLINE_FUNCTIONS
+#define INLINE_stream_next_bits STREAM_NEXT_BITS
+#define INLINE_stream_next_symbol STREAM_NEXT_SYMBOL
+#else
+#define INLINE_stream_next_bits jbsr stream_next_bits
+#define INLINE_stream_next_symbol jbsr stream_next_symbol
+#endif
+
+ /* Subroutine form of STREAM_NEXT_BITS: d1.b = number of bits wanted,
+ * d0.w = result, d5-d6/a5 = stream state (updated). */
+stream_next_bits:
+ STREAM_NEXT_BITS
+ rts
+
+ /* uncompressed_block: copy a stored (BTYPE=00b) block to the output.
+ * The bit cache is discarded so that reading resumes on a byte boundary,
+ * the 16-bit LEN field is read, NLEN is skipped without being checked,
+ * and LEN bytes are copied from the input to the output.
+ * On return the bit cache (d5/d6) is empty. */
+ /* d5-d6/a5 = stream, a4 = output */
+ /* d0-d1 are scratched */
+uncompressed_block:
+#if OPT_TABLE_LOOKUP
+ /* Push whole bytes back into input stream. */
+ lsr.w #3,d6
+ sub.w d6,a5
+#else
+ /* No need to push bytes back into input stream because stream_next_
+ * {bits,symbol} will never leave more than 7 bits cached. */
+#endif
+ /* Snap input stream up to byte boundary. */
+ moveq #0,d5
+ moveq #0,d6
+ /* Read block header and copy LEN bytes. */
+ moveq #16,d1
+ jbsr stream_next_bits /* LEN */
+ addq.w #2,a5 /* skip NLEN */
+ /* Enter the loop at the dbf with d0.w = LEN so that exactly LEN bytes
+ * are copied. In particular LEN=0 (an empty stored block) copies
+ * nothing, instead of wrapping round to a 65536-byte copy. */
+ jra 2f
+1: move.b (a5)+,(a4)+
+2: dbf d0,1b
+ rts
+
+#define o_hdist /*0*/
+#define o_hlit 2
+#define o_lens (o_hlit+2)
+#define o_codelen_tree (o_lens+nr_litlen_symbols+nr_distance_symbols)
+#if OPT_TABLE_LOOKUP
+/* Lit/len and codelen lookup structures share space. */
+#define o_litlen_tree o_codelen_tree
+#else
+#define o_litlen_tree (o_codelen_tree+LOOKUP_BYTES(nr_codelen_symbols))
+#endif
+#define o_dist_tree (o_litlen_tree+LOOKUP_BYTES(nr_litlen_symbols))
+#define o_stream (o_dist_tree+LOOKUP_BYTES(nr_distance_symbols))
+#define o_frame (o_stream+3*4)
+#if OPT_STORAGE_OFFSTACK
+#define o_mode (o_frame)
+#else
+/* Allow for BSR return address from decoder */
+#define o_mode (o_frame+4)
+#endif
+#define o_dist_extra (o_mode+4)
+#define o_length_extra (o_dist_extra+30*4)
+
+ /* static_huffman: decode a static-Huffman block (BTYPE=01b).
+ * The caller's stream state is pushed first, so it ends up in the
+ * o_stream slot of the frame; the stream is then redirected to
+ * static_huffman_prefix and control joins dynamic_huffman (at its local
+ * label 1) to zero the rest of the frame and build the tables. */
+ /* d5-d6/a5 = stream, a4 = output */
+ /* d0-d4,a0-a3 are scratched */
+static_huffman:
+ movem.l d5-d6/a5,-(aS)
+ moveq #0,d5
+ moveq #0,d6
+ lea static_huffman_prefix(pc),a5
+ move.w #o_stream/4-2,d0 /* zero only the part of the frame below o_stream */
+ jra 1f
+
+ /* dynamic_huffman: decode a dynamic-Huffman block (BTYPE=10b).
+ * Builds a zeroed frame (see the o_xxx offsets), reads HLIT/HDIST/HCLEN
+ * and the code-length code, decodes the literal/length and distance code
+ * lengths into len[], builds both decode structures and then falls
+ * through into decode_loop. For a dynamic block the o_stream slot stays
+ * zero; static_huffman enters at local label 1 with the real stream
+ * state saved there instead. */
+ /* d5-d6/a5 = stream, a4 = output */
+ /* d0-d4,a0-a3 are scratched */
+dynamic_huffman:
+ /* Allocate stack space for len[] and node[] arrays */
+ move.w #o_frame/4-2,d0
+1: moveq #0,d1
+1: move.l d1,-(aS)
+ dbf d0,1b
+ /* HLIT = stream_next_bits(5) + 257 */
+ moveq #5,d1
+ jbsr stream_next_bits
+ add.w #257,d0
+ move.w d0,-(aS)
+ /* HDIST = stream_next_bits(5) + 1 */
+ moveq #5,d1
+ jbsr stream_next_bits
+ addq.w #1,d0
+ move.w d0,-(aS)
+ /* HCLEN = stream_next_bits(4) + 4 */
+ moveq #4,d1
+ jbsr stream_next_bits
+ addq.w #4-1,d0 /* -1 for dbf */
+ /* Initialise len[] array with code-length symbol code lengths */
+ lea codelen_order(pc),a1
+ lea o_lens(aS),a0 /* a0 = len[] */
+ moveq #0,d2
+ move.w d0,d3
+1: moveq #3,d1
+ jbsr stream_next_bits
+ move.b (a1)+,d2
+ move.b d0,(a0,d2.w) /* len[codelen_order[i++]] = next_bits(3) */
+ dbf d3,1b
+ /* Build the codelen_tree */
+ lea o_codelen_tree(aS),a1
+ moveq #nr_codelen_symbols,d0
+#if OPT_TABLE_LOOKUP
+ moveq #127,d1 /* don't left-shift any symbols */
+#endif
+ jbsr build_code /* build_code(codelen_tree) */
+ /* Read the literal/length & distance code lengths */
+ move.w o_hlit(aS),d2
+ add.w o_hdist(aS),d2
+ subq.w #1,d2 /* d2 = hlit+hdist-1 */
+ move.l a0,a2 /* a2 = len[] */
+ move.l a1,a0 /* a0 = a1 = codelen_tree */
+1: INLINE_stream_next_symbol
+ cmp.b #16,d0
+ jcs c_lit
+ jeq c_16
+ cmp.b #17,d0
+ jeq c_17
+c_18: /* 18: Repeat zero N times (N = 11 + next 7 bits) */
+ moveq #7,d1
+ jbsr stream_next_bits
+ addq.w #11-3,d0
+ jra 2f
+c_17: /* 17: repeat zero N times (N = 3 + next 3 bits) */
+ moveq #3,d1
+ jbsr stream_next_bits
+2: moveq #0,d1
+ jra 3f
+c_16: /* 16: repeat previous N times (N = 3 + next 2 bits) */
+ moveq #2,d1
+ jbsr stream_next_bits
+ move.b -1(a2),d1
+3: addq.w #3-1,d0 /* d0 = N-1 (for dbf) */
+ sub.w d0,d2 /* the outer dbf accounts for the remaining one */
+4: move.b d1,(a2)+
+ dbf d0,4b
+ jra 5f
+c_lit: /* 0-15: Literal code length */
+ move.b d0,(a2)+
+5: dbf d2,1b
+ /* Build the lit/len and distance trees */
+#if OPT_TABLE_LOOKUP
+ /* Clear the codelen tree (shared space with lit/len tree).
+ * NB. a0 = a1 = codelen_tree = litlen_tree */
+ moveq #0,d0
+ move.w #LOOKUP_BYTES(nr_codelen_symbols)/4-1,d1
+1: move.l d0,(a0)+
+ dbf d1,1b
+ /* litlen_tree (= codelen_tree) is already in a1, and now zeroed. */
+#else
+ lea o_litlen_tree(aS),a1
+#endif
+ lea o_lens(aS),a0
+ move.w o_hlit(aS),d0
+#if OPT_TABLE_LOOKUP
+ move.w #256,d1
+ move.w d1,d4 /* left-shift symbols >256 (i.e., lengths) */
+ /* d4.w = 256 is also kept for the literal test in decode_loop */
+#endif
+ jbsr build_code /* build_code(litlen_tree) */
+ add.w d0,a0 /* distance code lengths follow the hlit lit/len ones */
+ lea o_dist_tree(aS),a1
+ move.w o_hdist(aS),d0
+#if OPT_TABLE_LOOKUP
+ moveq #0,d1 /* left-shift all symbols (i.e., distances) */
+#endif
+ jbsr build_code /* build_code(dist_tree) */
+ /* Reinstate the main stream if we used the static prefix */
+ /* (the saved a5 at o_stream+8 is zero unless static_huffman stored it) */
+ tst.l o_stream+8(aS)
+ jeq decode_loop
+ movem.l o_stream(aS),d5-d6/a5
+ /* Now decode the compressed data stream up to EOB */
+ /* decode_loop: decode lit/len symbols until end-of-block (256).
+ * Literals (<256) are stored straight to the output. For a length symbol
+ * the base/extra-bits tables at o_length_extra and o_dist_extra (4-byte
+ * entries: extra-bit count word, then base value word) turn the length
+ * and distance symbols into a copy length and a copy distance, and the
+ * bytes are copied from earlier output. On EOB the frame is popped and
+ * control returns to the block dispatcher. */
+decode_loop:
+ lea o_litlen_tree(aS),a0
+ /* START OF HOT LOOP */
+2: INLINE_stream_next_symbol /* litlen_sym */
+#if OPT_TABLE_LOOKUP
+ cmp.w d4,d0 /* 4 cy (d4.w = 256) */
+#else
+ cmp.w #256,d0 /* 8 cy */
+#endif
+ jcc 2f /* 8 cy */
+ /* 0-255: Byte literal */
+ move.b d0,(a4)+ /* 8 cy */
+ jra 2b /* 10 cy */
+ /* END OF HOT LOOP -- 30 + ~108 + [34] = ~160 CYCLES */
+9: /* 256: End-of-block: we're done */
+ lea o_frame(aS),aS
+ rts
+2: jeq 9b
+ /* 257+: <length,distance> pair */
+#if !OPT_TABLE_LOOKUP /* Already shifted in case of OPT_TABLE_LOOKUP */
+ lsl.w #2,d0
+#endif
+ /* a2 = &length_extra[sym-257]: the table starts at symbol 257 */
+ lea o_length_extra-257*4(aS),a2
+ add.w d0,a2
+ move.w (a2)+,d1 /* d1 = number of extra bits */
+ INLINE_stream_next_bits
+ add.w (a2),d0 /* add the base value */
+ move.w d0,d3 /* d3 = cplen */
+ lea o_dist_tree(aS),a0
+ INLINE_stream_next_symbol /* dist_sym */
+#if !OPT_TABLE_LOOKUP /* Already shifted in case of OPT_TABLE_LOOKUP */
+ lsl.w #2,d0
+#endif
+ lea o_dist_extra(aS),a2
+ add.w d0,a2
+ move.w (a2)+,d1 /* d1 = number of extra bits */
+ INLINE_stream_next_bits
+ add.w (a2),d0 /* d0 = cpdst */
+ move.l a4,a0
+ sub.w d0,a0 /* a0 = outp - cpdst */
+#if OPT_UNROLL_COPY_LOOP
+ /* Two bytes per iteration; an odd length enters at the second move. */
+ lsr.w #1,d3
+ jcs 4f
+ subq.w #1,d3
+3: move.b (a0)+,(a4)+
+4: move.b (a0)+,(a4)+
+#else
+ subq.w #1,d3
+3: move.b (a0)+,(a4)+
+#endif
+ dbf d3,3b
+ jra decode_loop
+
+/* Subroutine form of STREAM_NEXT_SYMBOL; only assembled when the macro is
+ * not being inlined at each use. a0 = tree, d0.w = decoded symbol. */
+#if !OPT_INLINE_FUNCTIONS
+stream_next_symbol:
+ STREAM_NEXT_SYMBOL
+ rts
+#endif
+
+ /* Build a base/extra-bits table on the stack.
+ * d0 = #pairs-1, d2 = max_value, d4 = log_2(extrabits_repeat)
+ * Entries are pushed from the highest symbol down to symbol 0, each as a
+ * base word followed by an extra-bit-count word, so in memory every
+ * entry reads [extrabits.w, base.w] in ascending symbol order.
+ * When the table lives on the real stack the return address is popped
+ * into a0 first and the routine returns with jmp (a0), because the
+ * table is pushed over the slot where the return address was. */
+build_base_extrabits:
+#if !OPT_STORAGE_OFFSTACK
+ move.l (sp)+,a0
+#endif
+1: move.w d0,d3
+ lsr.w d4,d3
+ subq.w #1,d3 /* extrabits = (sym >> d4) - 1 ... */
+ jcc 2f
+ moveq #0,d3 /* ... clamped to zero */
+2: moveq #0,d1
+ bset d3,d1 /* d1 = 1 << extrabits */
+ sub.w d1,d2 /* d2 = base */
+ move.w d2,-(aS)
+ move.w d3,-(aS)
+ dbf d0,1b
+#if !OPT_STORAGE_OFFSTACK
+ jmp (a0)
+#else
+ rts
+#endif
+
+ /* One byte per BTYPE value 0-2: offset of the block decoder from
+ * uncompressed_block. _inflate fetches the byte into d0 and calls
+ * (uncompressed_block + d0.w). */
+dispatch: /* Decoder dispatch table. */
+ dc.b uncompressed_block - uncompressed_block
+ dc.b static_huffman - uncompressed_block
+ dc.b dynamic_huffman - uncompressed_block
+
+ /* Element i is the code-length symbol whose 3-bit length is the i-th
+ * one read from a dynamic block header (see dynamic_huffman). */
+codelen_order: /* Order of code lengths for the code length alphabet. */
+ dc.b 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+
+ /* _inflate: decompress a complete DEFLATE stream.
+ * Builds the length and distance base/extra-bits tables, then decodes
+ * blocks one at a time, dispatching on BTYPE, until a block with
+ * BFINAL=1 has been processed. */
+ /* a4 = output, a5 = input, all regs preserved
+ * a6 = *end* of storage area (only if OPT_STORAGE_OFFSTACK) */
+_inflate:
+ movem.l SAVE_RESTORE_REGS,-(aS)
+
+ /* Build the <length> base/extra-bits table */
+ /* The last entry is pushed by hand as one longword:
+ * extra bits = 0 (high word), base = 258 (low word). */
+ move.l #258,d2
+ move.l d2,-(aS)
+ addq.w #1,d2
+ moveq #27,d0
+ moveq #2,d4
+ jbsr build_base_extrabits
+
+ /* Build the <distance> base/extra-bits table */
+ move.w #32769,d2
+ moveq #29,d0
+ moveq #1,d4
+ jbsr build_base_extrabits
+
+ /* Initialise the stream */
+ moveq #0,d5 /* d5 = stream: fetched data */
+ moveq #0,d6 /* d6 = stream: nr fetched bits */
+
+1: /* Process a block: Grab the BTYPE|BFINAL 3-bit code */
+ moveq #3,d1
+ jbsr stream_next_bits
+ move.l d0,-(aS) /* keep the code (the o_mode slot) across the decoder */
+ /* Dispatch to the correct decoder for this block */
+ lsr.b #1,d0 /* drop BFINAL: d0 = BTYPE */
+ move.b dispatch(pc,d0.w),d0
+ lea uncompressed_block(pc),a0
+ jsr (a0,d0.w)
+ /* Keep going until we see BFINAL=1 */
+ move.l (aS)+,d0
+ lsr.b #1,d0 /* BFINAL -> carry */
+ jcc 1b
+
+ /* Pop the base/extra-bits lookup tables */
+ lea (30+29)*4(aS),aS
+
+ movem.l (aS)+,SAVE_RESTORE_REGS
+ rts
+
+#if OPT_PREGENERATE_TABLES
+ /* Static block, pre-generated tables: step aS down over the frame
+ * without rebuilding it, load the d4.w = 256 constant that decode_loop
+ * compares against, and decode. decode_loop pops the frame on EOB. */
+pregen_static_huffman:
+ lea -o_frame(aS),aS /* frame pre-generated; skip over it */
+ move.w #256,d4
+ jra decode_loop
+ /* Dynamic block, pre-generated tables: run the normal dynamic_huffman
+ * decoder with aS moved 3000 bytes lower, carrying the longword at (aS)
+ * across to the new position first, then move aS back. */
+pregen_dynamic_huffman:
+ move.l (aS),d0
+ lea -3000(aS),aS /* move to dynamic-huffman frame */
+ move.l d0,(aS) /* copy o_mode into it */
+ jbsr dynamic_huffman
+ lea 3000(aS),aS
+ rts
+
+ /* Pre-generate conversion tables for Inflate. */
+ /* a6 = Pointer to end of 6000-byte block of memory to contain
+ * pre-generated tables. All registers preserved.
+ * Runs the full decoder twice over pregen_dummy_block (a static block
+ * holding only the EOB symbol): once with the storage pointer as passed
+ * and once with it moved 3000 bytes lower. The tables written during
+ * each run are left behind in the storage block.
+ * NOTE(review): the code assumes aS is a6, i.e. OPT_STORAGE_OFFSTACK is
+ * enabled together with OPT_PREGENERATE_TABLES -- confirm. */
+inflate_gentables:
+ movem.l a5-a6,-(sp)
+ lea pregen_dummy_block(pc),a5
+ /* The decoder entry point defined in this file is the label _inflate;
+ * there is no plain 'inflate' symbol to branch to. */
+ jbsr _inflate /* static block */
+ lea -3000(aS),aS
+ lea pregen_dummy_block(pc),a5
+ jbsr _inflate /* dynamic block */
+ movem.l (sp)+,a5-a6
+ rts
+
+ /* Inflate, using pre-generated tables. */
+ /* Same block loop as _inflate, except that the base/extra-bits tables
+ * are stepped over rather than built, and blocks are dispatched through
+ * pregen_dispatch (word offsets from uncompressed_block). */
+ /* a4 = output, a5 = input, all regs preserved
+ * a6 = *end* of 6000-byte pre-generated storage area */
+inflate_fromtables:
+ movem.l SAVE_RESTORE_REGS,-(aS)
+
+ /* Skip the pre-generated base/extra-bits lookup tables */
+ lea -(30+29)*4(aS),aS
+
+ /* Initialise the stream */
+ moveq #0,d5 /* d5 = stream: fetched data */
+ moveq #0,d6 /* d6 = stream: nr fetched bits */
+
+1: /* Process a block: Grab the BTYPE|BFINAL 3-bit code */
+ moveq #3,d1
+ jbsr stream_next_bits
+ move.l d0,-(aS)
+ /* Dispatch to the correct decoder for this block */
+ and.b #0xfe,d0 /* clear BFINAL: d0 = BTYPE*2 = byte offset into the word table */
+ move.w pregen_dispatch(pc,d0.w),d0
+ lea uncompressed_block(pc),a0
+ jsr (a0,d0.w)
+ /* Keep going until we see BFINAL=1 */
+ move.l (aS)+,d0
+ lsr.b #1,d0 /* BFINAL -> carry */
+ jcc 1b
+
+ /* Pop the base/extra-bits lookup tables */
+ lea (30+29)*4(aS),aS
+
+ movem.l (aS)+,SAVE_RESTORE_REGS
+ rts
+
+ /* One word per BTYPE value 0-2: offset of the block decoder from
+ * uncompressed_block, as used by inflate_fromtables. */
+pregen_dispatch:
+ dc.w uncompressed_block - uncompressed_block
+ dc.w pregen_static_huffman - uncompressed_block
+ dc.w pregen_dynamic_huffman - uncompressed_block
+ /* Input stream fed to the decoder by inflate_gentables. */
+pregen_dummy_block: /* A single static block containing EOB symbol only */
+ dc.b 0x03,0x00
+#endif /* OPT_PREGENERATE_TABLES */
+
+#undef o_hdist
+#undef o_hlit
+#undef o_lens
+#undef o_codelen_tree
+#undef o_litlen_tree
+#undef o_dist_tree
+#undef o_frame
--- /dev/null
+
+/* Real hardware UAE state file loader */
+/* Copyright 2019 Toni Wilen */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <exec/types.h>
+#include <exec/execbase.h>
+#include <proto/exec.h>
+#include <proto/graphics.h>
+#include <proto/dos.h>
+#include <graphics/gfxbase.h>
+#include <dos/dosextens.h>
+#include <hardware/cia.h>
+#include <hardware/custom.h>
+
+#include "header.h"
+
+extern struct GfxBase *GfxBase;
+extern struct DosLibrary *DosBase;
+
+// Chunk identifiers that read_chunk() loads in full (NULL-terminated list).
+static const char *const chunknames[] =
+{
+ "ASF ",
+ "CPU ", "CHIP", "AGAC",
+ "CIAA", "CIAB", "ROM ",
+ "DSK0", "DSK1", "DSK2", "DSK3",
+ "AUD0", "AUD1", "AUD2", "AUD3",
+ "END ",
+ NULL
+};
+// Memory chunk identifiers (NULL-terminated list): read_chunk() loads at most
+// the first 16 bytes of these and skips over the rest of the chunk.
+static const char *const memchunknames[] =
+{
+ "CRAM", "BRAM", "FRAM",
+ NULL
+};
+
+
+/* Return the big-endian 32-bit value stored at chunk[offset..offset+3]. */
+static ULONG getlong(UBYTE *chunk, int offset)
+{
+    UBYTE *b = chunk + offset;
+
+    // Widen every byte to ULONG before shifting. Shifting the promoted int
+    // left by 24 would move bit 7 into the sign bit (and would overflow
+    // entirely if int is 16 bits wide).
+    return ((ULONG)b[0] << 24) | ((ULONG)b[1] << 16) | ((ULONG)b[2] << 8) | (ULONG)b[3];
+}
+/* Return the big-endian 16-bit value stored at chunk[offset..offset+1],
+ * zero-extended to ULONG. */
+static ULONG getword(UBYTE *chunk, int offset)
+{
+    UBYTE *b = chunk + offset;
+
+    // Shift in ULONG, as getlong() does, so the result never depends on the
+    // width or signedness of int.
+    return ((ULONG)b[0] << 8) | (ULONG)b[1];
+}
+
+/*
+ * Load the AGA palette from p: 8 banks of 32 colour registers, each entry a
+ * big-endian 32-bit value with 8 bits per colour component in its low 24 bits.
+ * Every register is written twice: first with the high nibble of each
+ * component (BPLCON3 LOCT bit clear), then with the low nibble (LOCT set).
+ * Returns without touching the hardware when the VPOSR chip ID check fails.
+ */
+static void set_agacolor(UBYTE *p)
+{
+    volatile struct Custom *c = (volatile struct Custom*)0xdff000;
+
+    int aga = (c->vposr & 0x0f00) == 0x0300;
+    if (!aga)
+        return;
+
+    for (int bank = 0; bank < 8; bank++) {
+        // Each bank has its own 32 entries in the source data; without this
+        // offset every bank would be loaded with colours 0-31.
+        UBYTE *bankdata = p + bank * 32 * 4;
+        for (int loct = 0; loct < 2; loct++) {
+            // Bits 15-13 select the bank, bit 9 (LOCT) selects the low nibbles.
+            c->bplcon3 = (bank << 13) | (loct ? (1 << 9) : 0);
+            for (int j = 0; j < 32; j++) {
+                ULONG c32 = getlong(bankdata, j * 4);
+                // The masks below pick the low nibble of each component, so
+                // shift the high nibbles down for the LOCT=0 pass.
+                if (!loct)
+                    c32 >>= 4;
+                UWORD col = ((c32 & 0x00000f) << 0) | ((c32 & 0x000f00) >> 4) | ((c32 & 0x0f0000) >> 8);
+                c->color[j] = col;
+            }
+        }
+    }
+}
+
+/*
+ * Busy-wait until the upper byte of VHPOSR has changed `lines` times.
+ * Returns immediately when lines <= 0.
+ */
+static void wait_lines(WORD lines)
+{
+    volatile struct Custom *custom = (volatile struct Custom*)0xdff000;
+    UWORD previous = custom->vhposr & 0xff00;
+
+    for (; lines > 0; lines--) {
+        UWORD current;
+
+        // Poll until the position byte differs from the last one seen.
+        do {
+            current = custom->vhposr & 0xff00;
+        } while (current == previous);
+        previous = current;
+    }
+}
+
+// Pulse the DSKSTEP line in CIA-B port B (clear the bit, then set it again)
+// and then wait 300 wait_lines() units before returning. Drive selection and
+// step direction are whatever the caller has already set up in the port.
+static void step_floppy(void)
+{
+ volatile struct CIA *ciab = (volatile struct CIA*)0xbfd000;
+ ciab->ciaprb &= ~CIAF_DSKSTEP;
+ // delay
+ // (the write is simply repeated; presumably this stretches the pulse)
+ ciab->ciaprb &= ~CIAF_DSKSTEP;
+ ciab->ciaprb |= CIAF_DSKSTEP;
+ wait_lines(300);
+}
+
+/*
+ * Restore floppy drive `num` from its chunk data: optionally switch the
+ * motor on, step the head until the track 0 sensor is active, then step it
+ * `track` times in the opposite direction.
+ * p[4] holds the state flags (bit 0: motor on, bit 1: drive disabled) and
+ * p[5] the track number. Disabled drives and tracks >= 80 are ignored.
+ * If track 0 is not reached within 80 steps the drive is deselected with
+ * the motor bit inactive and the function returns.
+ */
+static void set_floppy(UBYTE *p, ULONG num)
+{
+    UBYTE state = p[4];
+    UBYTE track = p[5];
+
+    // drive disabled?
+    if (state & 2)
+        return;
+    // invalid track?
+    if (track >= 80)
+        return;
+
+    volatile struct CIA *ciaa = (volatile struct CIA*)0xbfe001;
+    volatile struct CIA *ciab = (volatile struct CIA*)0xbfd000;
+
+    // All port bits high; the control lines used below are driven by clearing them.
+    ciab->ciaprb = 0xff;
+
+    // motor on?
+    if (state & 1) {
+        ciab->ciaprb &= ~CIAF_DSKMOTOR;
+    }
+    // select drive
+    ciab->ciaprb &= ~(CIAF_DSKSEL0 << num);
+
+    wait_lines(100);
+
+    // Up to 80 sensor checks, stepping after every check that fails. The
+    // outcome is tracked explicitly so that track 0 being detected on the
+    // very last check is not mistaken for a missing drive.
+    int at_track0 = 0;
+    for (int attempt = 0; attempt < 80; attempt++) {
+        if (!(ciaa->ciapra & CIAF_DSKTRACK0)) {
+            at_track0 = 1;
+            break;
+        }
+        step_floppy();
+    }
+    wait_lines(100);
+    if (!at_track0) {
+        // no track0 after 80 steps: drive missing or not responding
+        ciab->ciaprb |= CIAF_DSKMOTOR;
+        ciab->ciaprb |= CIAF_DSKSEL0 << num;
+        return;
+    }
+
+    // Reverse the step direction and move to the saved track.
+    ciab->ciaprb &= ~CIAF_DSKDIREC;
+    wait_lines(100);
+    for (UBYTE i = 0; i < track; i++) {
+        step_floppy();
+    }
+
+    // deselect drive
+    ciab->ciaprb |= CIAF_DSKSEL0 << num;
+}
+
+/*
+ * Program audio channel `num` from its chunk data p. The registers are
+ * written in the order volume, length, period, location high, location low.
+ */
+static void set_audio(UBYTE *p, ULONG num)
+{
+    // Byte offsets of the fields used within the chunk data.
+    const int ofs_vol = 1;
+    const int ofs_len = 1 + 1 + 1 + 1 + 2;
+    const int ofs_per = ofs_len + 2 + 2;
+    const int ofs_lch = ofs_per + 2;
+    const int ofs_lcl = ofs_lch + 2;
+
+    // Base of this channel's register block; word index = register offset / 2.
+    volatile UWORD *aud = (volatile UWORD*)(0xdff0a0 + 16 * num);
+
+    aud[4] = p[ofs_vol];            // AUDxVOL
+    aud[2] = getword(p, ofs_len);   // AUDxLEN
+    aud[3] = getword(p, ofs_per);   // AUDxPER
+    aud[0] = getword(p, ofs_lch);   // AUDxLCH
+    aud[1] = getword(p, ofs_lcl);   // AUDxLCL
+}
+
+/*
+ * Program sprite `num` from p, which holds four consecutive big-endian
+ * words: pointer high, pointer low, position and control.
+ */
+static void set_sprite(UBYTE *p, ULONG num)
+{
+    volatile UWORD *ptr_regs = (volatile UWORD*)(0xdff120 + 4 * num);
+    volatile UWORD *pos_regs = (volatile UWORD*)(0xdff140 + 8 * num);
+
+    UWORD pth = getword(p, 0);
+    UWORD ptl = getword(p, 2);
+    UWORD pos = getword(p, 4);
+    UWORD ctl = getword(p, 6);
+
+    ptr_regs[0] = pth;  // SPRxPTH
+    ptr_regs[1] = ptl;  // SPRxPTL
+    pos_regs[0] = pos;  // SPRxPOS
+    pos_regs[1] = ctl;  // SPRxCTL
+}
+
+/*
+ * Write the custom chip register image in p to the hardware at 0xdff000.
+ * The image starts 4 bytes into p and supplies one big-endian word per
+ * register offset 0x000-0x1fc, except that the sprite (0x120-0x17f) and
+ * audio (0xa0-0xdf) ranges take no data from p at all.
+ * Words for 0x58, 0x5e, DMACON (0x96), INTENA (0x9a) and for the 0x1c0+
+ * registers other than 0x1dc and 0x1fc are consumed but not written.
+ */
+static void set_custom(UBYTE *p)
+{
+    volatile UWORD *regs = (volatile UWORD*)0xdff000;
+    UBYTE *src = p + 4;
+
+    for (WORD reg = 0; reg < 0x1fe; reg += 2) {
+        int is_sprite = reg >= 0x120 && reg < 0x180;
+        int is_audio = reg >= 0xa0 && reg < 0xe0;
+
+        // sprites and audio: nothing to write and no data to consume
+        if (is_sprite || is_audio)
+            continue;
+
+        // Every other register owns the next word of the image.
+        UBYTE *data = src;
+        src += 2;
+
+        // skip blitter start, DMACON and INTENA
+        int skip = reg == 0x58 || reg == 0x5e || reg == 0x96 || reg == 0x9a;
+        // skip programmed sync registers except BEAMCON0 (0x1fc is also kept)
+        if (!skip && reg >= 0x1c0)
+            skip = reg != 0x1fc && reg != 0x1dc;
+        if (skip)
+            continue;
+
+        UWORD v = getword(data, 0);
+        if (reg == 0x1dc) {
+            // BEAMCON0: PAL/NTSC only
+            v &= 0x20;
+        } else if (reg == 0x9e) {
+            // ADKCON
+            v |= 0x8000;
+        }
+
+        regs[reg / 2] = v;
+    }
+}
+
+/*
+ * Final custom chip step: write 0x7fff to INTENA, INTREQ and DMACON, then
+ * write the DMACON, INTENA and INTREQ words from the register image in p
+ * (which starts 4 bytes into p) with bit 15 set.
+ */
+void set_custom_final(UBYTE *p)
+{
+    volatile struct Custom *custom = (volatile struct Custom*)0xdff000;
+    UBYTE *image = p + 4;
+
+    UWORD dmacon = getword(image, 0x96) | 0x8000;
+    UWORD intena = getword(image, 0x9a) | 0x8000;
+    UWORD intreq = getword(image, 0x9c) | 0x8000;
+
+    custom->intena = 0x7fff;
+    custom->intreq = 0x7fff;
+    custom->dmacon = 0x7fff;
+
+    custom->dmacon = dmacon;
+    custom->intena = intena;
+    custom->intreq = intreq;
+}
+
+// Load CIA `num` (0 selects the CIA at 0xbfe001, anything else the one at
+// 0xbfd000) from chunk data p: ports and data direction registers, both
+// timers and their latches, the TOD alarm and the TOD counter. The saved
+// control registers and interrupt mask (p[13..15]) are not written here;
+// set_cia_final() writes those.
+static void set_cia(UBYTE *p, ULONG num)
+{
+ volatile struct CIA *cia = (volatile struct CIA*)(num ? 0xbfd000 : 0xbfe001);
+ volatile struct Custom *c = (volatile struct Custom*)0xdff000;
+
+ // Stop both timers and clear their run mode bits.
+ cia->ciacra &= ~(CIACRAF_START | CIACRAF_RUNMODE);
+ cia->ciacrb &= ~(CIACRBF_START | CIACRBF_RUNMODE);
+ // The value read is unused; presumably the read itself acknowledges pending CIA interrupts.
+ UBYTE dummy = cia->ciaicr;
+ cia->ciaicr = 0x7f;
+ c->intreq = 0x7fff;
+
+ // Flags byte at p[27]: bit 0 = TOD latched, bit 1 = alarm latched (see below).
+ UBYTE flags = p[16 + 1 + 2 * 2 + 3 + 3];
+
+ cia->ciapra = p[0];
+ cia->ciaprb = p[1];
+ cia->ciaddra = p[2];
+ cia->ciaddrb = p[3];
+
+ // load timers
+ cia->ciatalo = p[4];
+ cia->ciatahi = p[5];
+ cia->ciatblo = p[6];
+ cia->ciatbhi = p[7];
+ cia->ciacra |= CIACRAF_LOAD;
+ cia->ciacrb |= CIACRBF_LOAD;
+ // load timer latches
+ cia->ciatalo = p[16 + 1];
+ cia->ciatahi = p[16 + 2];
+ cia->ciatblo = p[16 + 3];
+ cia->ciatbhi = p[16 + 4];
+
+ // load alarm
+ // With CIACRBF_ALARM set, the TOD register writes below go to the alarm.
+ UBYTE *alarm = &p[16 + 1 + 2 * 2 + 3];
+ cia->ciacrb |= CIACRBF_ALARM;
+ if (flags & 2) {
+ // leave latched
+ cia->ciatodlow = alarm[0];
+ cia->ciatodmid = alarm[1];
+ cia->ciatodhi = alarm[2];
+ } else {
+ cia->ciatodhi = alarm[2];
+ cia->ciatodmid = alarm[1];
+ cia->ciatodlow = alarm[0];
+ }
+ cia->ciacrb &= ~CIACRBF_ALARM;
+
+ // load tod
+ // Same ordering rule as for the alarm: low-to-high leaves it latched.
+ UBYTE *tod = &p[8];
+ if (flags & 1) {
+ // leave latched
+ cia->ciatodlow = tod[0];
+ cia->ciatodmid = tod[1];
+ cia->ciatodhi = tod[2];
+ } else {
+ cia->ciatodhi = tod[2];
+ cia->ciatodmid = tod[1];
+ cia->ciatodlow = tod[0];
+ }
+}
+
+/*
+ * Final CIA step for CIA `num` (0 selects 0xbfe001, otherwise 0xbfd000):
+ * read the ICR once, write control registers A and B from p[14] and p[15]
+ * with their LOAD bits cleared, then write p[13] to the ICR with the
+ * SETCLR bit set.
+ */
+void set_cia_final(UBYTE *p, ULONG num)
+{
+    volatile struct CIA *cia = (volatile struct CIA*)(num ? 0xbfd000 : 0xbfe001);
+    UBYTE saved_icr = p[13];
+    UBYTE saved_cra = p[14];
+    UBYTE saved_crb = p[15];
+
+    // The value is unused; only the register read itself is wanted.
+    UBYTE dummy = cia->ciaicr;
+
+    cia->ciacra = saved_cra & ~CIACRAF_LOAD;
+    cia->ciacrb = saved_crb & ~CIACRBF_LOAD;
+    cia->ciaicr = saved_icr | CIAICRF_SETCLR;
+}
+
+/*
+ * Release every allocation recorded in st->allocations, newest first.
+ * Entries with a memory header go back to that header via Deallocate();
+ * the others are released with FreeMem().
+ */
+static void free_allocations(struct uaestate *st)
+{
+    int remaining = st->num_allocations;
+
+    while (remaining-- > 0) {
+        struct Allocation *entry = &st->allocations[remaining];
+
+        if (!entry->mh) {
+            FreeMem(entry->addr, entry->size);
+            continue;
+        }
+        Deallocate(entry->mh, entry->addr, entry->size);
+    }
+}
+
+/*
+ * Allocate `size` bytes at a fixed address with AllocAbs(), starting at
+ * st->extra_mem_pointer and moving up in 8-byte steps until an attempt
+ * succeeds. On success the allocation is recorded in st->allocations and
+ * the pointer is advanced past it (size rounded up to a multiple of 8).
+ * Returns NULL once the pointer plus size reaches the end of the extra RAM
+ * range.
+ * NOTE(review): the range check only runs after a failed attempt, so the
+ * first AllocAbs() call is not bounded by extra_ram/extra_ram_size -- confirm
+ * this is intended.
+ */
+static UBYTE *extra_allocate(ULONG size, struct uaestate *st)
+{
+    UBYTE *b;
+
+    for (;;) {
+        b = AllocAbs(size, st->extra_mem_pointer);
+        if (b) {
+            struct Allocation *a = &st->allocations[st->num_allocations++];
+            // free_allocations() picks FreeMem() vs Deallocate() from mh, so
+            // set it explicitly rather than relying on the slot being zeroed.
+            a->mh = NULL;
+            a->addr = b;
+            a->size = size;
+            st->extra_mem_pointer += (size + 7) & ~7;
+            return b;
+        }
+        st->extra_mem_pointer += 8;
+        if (st->extra_mem_pointer + size >= st->extra_ram + st->extra_ram_size)
+            return NULL;
+    }
+}
+
+// allocate from extra mem
+/*
+ * Allocate `size` bytes of temporary memory. The private memory header
+ * st->extra_mem_head is tried first when present; if that fails or there is
+ * no header, extra_allocate() is used. Successful allocations are recorded
+ * in st->allocations. Returns NULL if both methods fail.
+ */
+static UBYTE *tempmem_allocate(ULONG size, struct uaestate *st)
+{
+    if (st->extra_mem_head) {
+        UBYTE *mem = Allocate(st->extra_mem_head, size);
+        if (mem) {
+            struct Allocation *entry = &st->allocations[st->num_allocations++];
+            entry->mh = st->extra_mem_head;
+            entry->addr = mem;
+            entry->size = size;
+            return mem;
+        }
+    }
+    return extra_allocate(size, st);
+}
+
+// allocate from statefile reserved bank index
+/*
+ * Allocate `size` bytes inside the target range of memory bank `index`.
+ * AllocAbs() is tried at 64 KiB steps, starting 64 KiB above the bank's
+ * target address, until it succeeds or the candidate block would reach the
+ * end of the bank. A successful allocation is recorded in st->allocations.
+ * Returns NULL if the bank has no target size or no block could be obtained.
+ */
+static UBYTE *tempmem_allocate_reserved(ULONG size, WORD index, struct uaestate *st)
+{
+    struct MemoryBank *mb = &st->membanks[index];
+    if (!mb->targetsize)
+        return NULL;
+    UBYTE *addr = mb->targetaddr;
+    for (;;) {
+        addr += 65536;
+        if (addr - mb->targetaddr + size >= mb->targetsize)
+            return NULL;
+        UBYTE *b = AllocAbs(size, addr);
+        if (b) {
+            struct Allocation *a = &st->allocations[st->num_allocations++];
+            // free_allocations() picks FreeMem() vs Deallocate() from mh, so
+            // set it explicitly rather than relying on the slot being zeroed.
+            a->mh = NULL;
+            a->addr = b;
+            a->size = size;
+            return b;
+        }
+    }
+}
+
+/*
+ * Read the whole chunk of memory bank `index` (mb->size + 12 bytes starting
+ * at file offset mb->offset) into a temporary buffer and store the buffer in
+ * mb->addr. Chip RAM data is placed inside the slow or fast RAM target
+ * range when possible, slow RAM data inside the fast RAM target range;
+ * otherwise general temporary memory is used. The file position is restored
+ * before returning. Failures are reported with printf() and counted in
+ * st->errors.
+ */
+static void load_memory(FILE *f, WORD index, struct uaestate *st)
+{
+    struct MemoryBank *mb = &st->membanks[index];
+    ULONG oldoffset = ftell(f);
+    ULONG chunksize = mb->size + 12;
+    fseek(f, mb->offset, SEEK_SET);
+    // Pointers are cast to ULONG so the arguments match the %lx conversions.
+    printf("Memory '%s', size %luk, offset %lu. Target %08lx.\n", mb->chunk, chunksize >> 10, mb->offset, (ULONG)mb->targetaddr);
+    // if Chip RAM and space in another statefile block? Put it there because chip ram is decompressed first.
+    if (index == MB_CHIP) {
+        mb->addr = tempmem_allocate_reserved(chunksize, MB_SLOW, st);
+        if (!mb->addr)
+            mb->addr = tempmem_allocate_reserved(chunksize, MB_FAST, st);
+    } else if (index == MB_SLOW) {
+        mb->addr = tempmem_allocate_reserved(chunksize, MB_FAST, st);
+    }
+    if (!mb->addr)
+        mb->addr = tempmem_allocate(chunksize, st);
+    if (mb->addr) {
+        printf(" - Address %08lx - %08lx.\n", (ULONG)mb->addr, (ULONG)(mb->addr + chunksize - 1));
+        // Keep the byte count unsigned: it is compared with chunksize and
+        // printed with %lu.
+        ULONG got = (ULONG)fread(mb->addr, 1, chunksize, f);
+        if (got != chunksize) {
+            printf("ERROR: Read error (%lu != %lu).\n", got, chunksize);
+            st->errors++;
+        }
+    } else {
+        printf("ERROR: Out of memory.\n");
+        st->errors++;
+    }
+    fseek(f, oldoffset, SEEK_SET);
+}
+
+/*
+ * Read a 12-byte chunk header (4-character name, 32-bit size, 32-bit flags)
+ * from f. The size and flags words are read as raw bytes into ULONGs, i.e.
+ * in the host's byte order.
+ *
+ * cnamep receives the NUL-terminated name (5 bytes), or an empty string if
+ *        the name or size could not be read.
+ * sizep  receives the payload size (the stored size minus the 12 header
+ *        bytes), or 0 if the flags word is missing or the stored size is
+ *        smaller than a header.
+ * flagsp receives the flags word, or 0 in the same cases.
+ *
+ * Returns 0 if the name or size could not be read, 1 otherwise.
+ */
+static int read_chunk_head(FILE *f, UBYTE *cnamep, ULONG *sizep, ULONG *flagsp)
+{
+    ULONG size = 0, flags = 0;
+    UBYTE cname[5];
+
+    *flagsp = 0;
+    *sizep = 0;
+    cnamep[0] = 0;
+    if (fread(cname, 1, 4, f) != 4) {
+        return 0;
+    }
+    cname[4] = 0;
+    strcpy(cnamep, cname);
+
+    if (fread(&size, 1, 4, f) != 4) {
+        cnamep[0] = 0;
+        return 0;
+    }
+
+    // Require the complete flags word: a short read would otherwise leave
+    // `flags` only partly filled in and still be reported to the caller.
+    if (fread(&flags, 1, 4, f) != 4) {
+        return 1;
+    }
+
+    // A stored size smaller than the header itself means no payload.
+    if (size < 12)
+        return 1;
+
+    *sizep = size - 12;
+    *flagsp = flags;
+    return 1;
+}
+
+ /*
+  * Read a chunk payload of 'size' bytes from the current file position into
+  * temporary memory and skip the alignment bytes that follow it.
+  *
+  * f     - open statefile, positioned just after the chunk header
+  * cname - chunk name (kept for the callers' benefit; not used here)
+  * size  - payload size in bytes
+  * st    - loader state; st->errors is incremented on failure so the caller's
+  *         pass is reported as failed instead of continuing with a NULL chunk
+  *
+  * Returns the buffer obtained from tempmem_allocate(), or NULL on failure.
+  */
+ static UBYTE *load_chunk(FILE *f, UBYTE *cname, ULONG size, struct uaestate *st)
+ {
+     UBYTE *b = tempmem_allocate(size, st);
+
+     if (!b) {
+         printf("ERROR: Not enough memory (%lu bytes required).\n", size);
+         st->errors++;
+         return NULL;
+     }
+
+     if (fread(b, 1, size, f) != size) {
+         printf("ERROR: Read error.\n");
+         st->errors++;
+         return NULL;
+     }
+
+     // Chunks are followed by 1-4 alignment bytes.
+     fseek(f, 4 - (size & 3), SEEK_CUR);
+
+     return b;
+ }
+
+ /*
+  * Read the next chunk of the statefile for pass 1.
+  *
+  * Chunks named in chunknames[] are read completely; for chunks named in
+  * memchunknames[] only the first 16 payload bytes are read; every other chunk
+  * is skipped. In all cases the file is left positioned at the next chunk.
+  *
+  * f      - open statefile
+  * cname  - receives the chunk name ("" when no header could be read)
+  * sizep  - receives the full payload size (0 when NULL is returned)
+  * flagsp - receives the chunk flags
+  * st     - loader state (not used here)
+  *
+  * Returns a malloc()ed buffer the caller must free(), or NULL when the chunk
+  * was skipped, is empty, or an error occurred.
+  */
+ static UBYTE *read_chunk(FILE *f, UBYTE *cname, ULONG *sizep, ULONG *flagsp, struct uaestate *st)
+ {
+     ULONG size, orgsize, flags;
+
+     // Give the out-parameters defined values on every return path.
+     *sizep = 0;
+     *flagsp = 0;
+
+     if (!read_chunk_head(f, cname, &size, &flags))
+         return NULL;
+     orgsize = size;
+     *flagsp = flags;
+
+     if (size == 0)
+         return NULL;
+
+     ULONG maxsize = 0x7fffffff;
+     int found = 0;
+     for (int i = 0; chunknames[i]; i++) {
+         if (!strcmp(cname, chunknames[i])) {
+             found = 1;
+             printf("Reading chunk '%s', %lu bytes, flags %08lx.\n", cname, size, flags);
+             break;
+         }
+     }
+     if (!found) {
+         // read only header if memory chunk
+         for (int i = 0; memchunknames[i]; i++) {
+             if (!strcmp(cname, memchunknames[i])) {
+                 found = 1;
+                 maxsize = 16;
+                 printf("Checking memory chunk '%s', %lu bytes, flags %08lx.\n", cname, size, flags);
+                 break;
+             }
+         }
+     }
+
+     if (!found) {
+         // Not interesting: step over the payload and its alignment bytes
+         // (size is known to be non-zero here).
+         fseek(f, size, SEEK_CUR);
+         fseek(f, 4 - (size & 3), SEEK_CUR);
+         return NULL;
+     }
+
+     *sizep = size;
+     if (size > maxsize)
+         size = maxsize;
+     UBYTE *chunk = malloc(size);
+     if (!chunk) {
+         printf("ERROR: Not enough memory.\n");
+         return NULL;
+     }
+     if (fread(chunk, 1, size, f) != size) {
+         printf("ERROR: Read error.\n");
+         free(chunk);
+         return NULL;
+     }
+     // Step over whatever part of the payload was not read, then the alignment.
+     if (orgsize > size) {
+         fseek(f, orgsize - size, SEEK_CUR);
+     }
+     fseek(f, 4 - (orgsize & 3), SEEK_CUR);
+     return chunk;
+ }
+
+ /*
+  * Walk the exec memory list and pick the largest region that is not already
+  * claimed by a statefile memory bank (st->mem_allocated[]). Region bounds are
+  * widened to 64K boundaries. A region replaces the current st->extra_ram only
+  * when it is larger than st->extra_ram_size.
+  */
+ static void find_extra_ram(struct uaestate *st)
+ {
+     struct MemHeader *hdr;
+
+     Forbid();
+     for (hdr = (struct MemHeader*)SysBase->MemList.lh_Head;
+          hdr->mh_Node.ln_Succ;
+          hdr = (struct MemHeader*)hdr->mh_Node.ln_Succ) {
+         // Is this header one of the regions the statefile will overwrite?
+         int claimed = 0;
+         for (int r = 0; r < MEMORY_REGIONS && !claimed; r++) {
+             if (st->mem_allocated[r] == hdr)
+                 claimed = 1;
+         }
+         if (claimed)
+             continue;
+
+         ULONG base = ((ULONG)hdr->mh_Lower) & 0xffff0000;
+         ULONG span = ((((ULONG)hdr->mh_Upper) + 0xffff) & 0xffff0000) - base;
+         if (span <= st->extra_ram_size)
+             continue;
+
+         st->extra_ram = (UBYTE*)base;
+         st->extra_ram_size = span;
+         st->extra_mem_head = hdr;
+     }
+     Permit();
+ }
+
+ /*
+  * Check that the RAM described by a statefile memory chunk exists in this
+  * machine and is large enough, and record the bank in st->membanks[index].
+  *
+  * cname     - chunk name, copied into the bank record
+  * chunk     - first bytes of the chunk payload; for compressed chunks the
+  *             first longword is taken as the required RAM size
+  * index     - memory bank slot to fill
+  * addr      - address the RAM must start at (compared against the 64K-aligned
+  *             start of each exec memory header)
+  * offset    - file offset of the chunk, stored for pass 2
+  * chunksize - payload size in the file
+  * flags     - chunk flags; bit 0 = compressed
+  * st        - loader state; st->errors is incremented on failure
+  *
+  * When at least 512K of the region stays unused after the bank, that tail may
+  * also be recorded as st->extra_ram.
+  *
+  * Returns 1 when the RAM is present and big enough, 0 otherwise.
+  */
+ static ULONG check_ram(UBYTE *cname, UBYTE *chunk, WORD index, ULONG addr, ULONG offset, ULONG chunksize, ULONG flags, struct uaestate *st)
+ {
+     ULONG size;
+     if (flags & 1) // compressed
+         size = getlong(chunk, 0);
+     else
+         size = chunksize;
+     printf("Statefile RAM: Address %08lx, size %luk.\n", addr, size >> 10);
+
+     int found = 0; // 1 = present and big enough, -1 = present but too small
+     ULONG mstart = 0, msize = 0;
+     Forbid();
+     struct MemHeader *mh = (struct MemHeader*)SysBase->MemList.lh_Head;
+     while (mh->mh_Node.ln_Succ) {
+         mstart = ((ULONG)mh->mh_Lower) & 0xffff0000;
+         msize = ((((ULONG)mh->mh_Upper) + 0xffff) & 0xffff0000) - mstart;
+         if (mstart == addr) {
+             if (msize >= size)
+                 found = 1;
+             else
+                 found = -1;
+             break;
+         }
+         mh = (struct MemHeader*)mh->mh_Node.ln_Succ;
+     }
+     Permit();
+     if (!found) {
+         printf("ERROR: Not found in this system.\n");
+         st->errors++;
+         return 0;
+     }
+
+     // The region exists: remember which header it is and describe the bank.
+     st->mem_allocated[index] = mh;
+     struct MemoryBank *mb = &st->membanks[index];
+     mb->size = chunksize;
+     mb->offset = offset;
+     mb->targetaddr = (UBYTE*)addr;
+     mb->targetsize = msize;
+     mb->flags = flags;
+     strcpy(mb->chunk, cname);
+     printf("- Detected memory at %08lx, total size %luk.\n", mstart, msize >> 10);
+     if (found > 0) {
+         printf("- Is usable (%luk required, %luk unused, offset %lu).\n", size >> 10, (msize - size) >> 10, offset);
+         ULONG extrasize = msize - size;
+         if (extrasize >= 524288) {
+             // Prefer a tail at or above 0x00200000 over one below it; otherwise the bigger one wins.
+             if ((mstart >= 0x00200000 && st->extra_ram < (UBYTE*)0x00200000) || extrasize > st->extra_ram_size) {
+                 st->extra_ram = (UBYTE*)(mstart + size);
+                 st->extra_ram_size = extrasize;
+             }
+         }
+         return 1;
+     }
+     printf("ERROR: Not enough memory available (%luk required).\n", size >> 10);
+     st->errors++;
+     return 0;
+ }
+
+ /*
+  * Print the current track and the string stored at payload offset 16 for one
+  * floppy drive chunk.
+  *
+  * num - drive number, used for the "DFn:" label
+  * p   - chunk payload, or NULL when the chunk could not be loaded (ignored)
+  *
+  * Nothing is printed when bit 1 of the state byte marks the drive disabled.
+  */
+ static void floppy_info(int num, UBYTE *p)
+ {
+     if (!p) // load_chunk() failed; there is nothing to describe
+         return;
+
+     UBYTE state = p[4];
+     UBYTE track = p[5];
+     if (state & 2) // disabled
+         return;
+     printf("DF%d: Track %d, '%s'.\n", num, track, &p[4 + 1 + 1 + 1 + 1 + 4 + 4]);
+ }
+
+ /*
+  * Print the ROM recorded in a "ROM " chunk and warn when its version or
+  * revision differs from the words found at 0xf8000c / 0xf8000e of the ROM in
+  * this machine.
+  *
+  * p  - chunk payload; as read here: start address at offset 0, length at 4,
+  *      version word at 12, revision word at 14, CRC32 at 16, followed by two
+  *      NUL-terminated strings of which the second one is printed
+  * st - loader state (not used)
+  */
+ static void check_rom(UBYTE *p, struct uaestate *st)
+ {
+     UWORD ver = getword(p, 4 + 4 + 4);
+     UWORD rev = getword(p, 4 + 4 + 4 + 2);
+
+     UWORD *rom = (UWORD*)0xf80000;
+     UWORD rver = rom[12 / 2];
+     UWORD rrev = rom[14 / 2];
+
+     ULONG start = getlong(p, 0);
+     ULONG len = getlong(p, 4);
+     // A 256K ROM recorded at 0xf80000 is reported at 0xfc0000 instead.
+     if (start == 0xf80000 && len == 262144)
+         start = 0xfc0000;
+     ULONG crc32 = getlong(p, 4 + 4 + 4 + 4);
+
+     // Step over the first string (and its terminator) to reach the second.
+     UBYTE *path = &p[4 + 4 + 4 + 4 + 4];
+     while (*path++);
+
+     printf("ROM %08lx-%08lx %d.%d (CRC=%08lx).\n", start, start + len - 1, ver, rev, crc32);
+     printf("- '%s'\n", path);
+     if (ver != rver || rev != rrev) {
+         printf("WARNING: KS ROM version mismatch.\n");
+     }
+ }
+
+ /*
+  * Second pass over the statefile: load every memory bank registered by the
+  * first pass, then walk the chunks again and load the CPU, custom chip, AGA
+  * colour, CIA, floppy (DSK0-3) and audio (AUD0-3) chunks into temporary
+  * memory. All other chunks are skipped.
+  *
+  * Returns -1 when a chunk header cannot be read before "END " is seen,
+  * otherwise the value of st->errors (0 = success).
+  */
+ static int parse_pass_2(FILE *f, struct uaestate *st)
+ {
+     for (int i = 0; i < MEMORY_REGIONS; i++) {
+         struct MemoryBank *mb = &st->membanks[i];
+         if (mb->size) {
+             load_memory(f, i, st);
+         }
+     }
+
+     for (;;) {
+         ULONG size, flags;
+         UBYTE cname[5];
+
+         if (!read_chunk_head(f, cname, &size, &flags)) {
+             return -1;
+         }
+
+         if (!strcmp(cname, "END "))
+             break;
+
+         // cname is always exactly four characters plus NUL, so a three
+         // character prefix match plus a digit check identifies DSKn / AUDn.
+         int unit = -1;
+         if (cname[3] >= '0' && cname[3] <= '3')
+             unit = cname[3] - '0';
+
+         if (!strcmp(cname, "CPU ")) {
+             st->cpu_chunk = load_chunk(f, cname, size, st);
+         } else if (!strcmp(cname, "CHIP")) {
+             st->custom_chunk = load_chunk(f, cname, size, st);
+         } else if (!strcmp(cname, "AGAC")) {
+             st->aga_colors_chunk = load_chunk(f, cname, size, st);
+         } else if (!strcmp(cname, "CIAA")) {
+             st->ciaa_chunk = load_chunk(f, cname, size, st);
+         } else if (!strcmp(cname, "CIAB")) {
+             st->ciab_chunk = load_chunk(f, cname, size, st);
+         } else if (unit >= 0 && !strncmp(cname, "DSK", 3)) {
+             st->floppy_chunk[unit] = load_chunk(f, cname, size, st);
+             // Only describe the drive when its chunk was actually loaded.
+             if (st->floppy_chunk[unit])
+                 floppy_info(unit, st->floppy_chunk[unit]);
+         } else if (unit >= 0 && !strncmp(cname, "AUD", 3)) {
+             st->audio_chunk[unit] = load_chunk(f, cname, size, st);
+         } else {
+             // Unused chunk: step over payload and alignment bytes.
+             fseek(f, size, SEEK_CUR);
+             fseek(f, 4 - (size & 3), SEEK_CUR);
+         }
+     }
+
+     return st->errors;
+ }
+
+ /*
+  * First pass over the statefile: verify that it starts with an "ASF " chunk,
+  * compare the recorded CPU, chipset, RAM and ROM with this machine, and
+  * finally locate spare RAM for the loader's temporary data.
+  *
+  * Every chunk buffer returned by read_chunk() is released here.
+  *
+  * Returns -1 when the file ends prematurely or is not a statefile, otherwise
+  * the value of st->errors (0 = the statefile can be loaded).
+  */
+ static int parse_pass_1(FILE *f, struct uaestate *st)
+ {
+     int first = 1;
+     UBYTE *b = NULL;
+
+     for (;;) {
+         ULONG offset = ftell(f);
+         ULONG size, flags;
+         UBYTE cname[5];
+
+         b = read_chunk(f, cname, &size, &flags, st);
+         if (!strcmp(cname, "END "))
+             break;
+         if (!b) {
+             // No name at all means the header could not be read.
+             if (!cname[0])
+                 return -1;
+             continue;
+         }
+
+         if (first) {
+             // The first chunk that yields data must be the "ASF " header.
+             // Its buffer is not needed, so release it on both outcomes.
+             int is_statefile = !strcmp(cname, "ASF ");
+             free(b);
+             b = NULL;
+             if (!is_statefile) {
+                 printf("ERROR: Not UAE statefile.\n");
+                 return -1;
+             }
+             first = 0;
+             continue;
+         }
+
+         if (!strcmp(cname, "CPU ")) {
+             // Derive this machine's CPU model from the exec AttnFlags bits.
+             ULONG smodel = 68000;
+             for (int i = 0; i < 4; i++) {
+                 if (SysBase->AttnFlags & (1 << i))
+                     smodel += 10;
+             }
+             if (SysBase->AttnFlags & 0x80)
+                 smodel = 68060;
+             ULONG model = getlong(b, 0);
+             if (smodel != model) {
+                 printf("- WARNING: %lu CPU statefile.\n", model);
+             }
+             if (model > 68020) {
+                 printf("- ERROR: Only 68000/68010/68020 statefiles are supported.\n");
+                 st->errors++;
+             }
+         } else if (!strcmp(cname, "CHIP")) {
+             UWORD vposr = getword(b, 4 + 4); // VPOSR
+             volatile struct Custom *c = (volatile struct Custom*)0xdff000;
+             UWORD svposr = c->vposr;
+             // Same bit tests on the statefile's and this machine's VPOSR.
+             int aga = (vposr & 0x0f00) == 0x0300;
+             int ecs = (vposr & 0x2000) == 0x2000;
+             int ntsc = (vposr & 0x1000) == 0x1000;
+             int saga = (svposr & 0x0f00) == 0x0300;
+             int secs = (svposr & 0x2000) == 0x2000;
+             int sntsc = (svposr & 0x1000) == 0x1000;
+             printf("Chipset: %s %s (%04X).\n", aga ? "AGA" : (ecs ? "ECS" : "OCS"), ntsc ? "NTSC" : "PAL", vposr);
+             if (aga && !saga) {
+                 printf("- WARNING: AGA statefile.\n");
+             }
+             if (saga && !aga) {
+                 printf("- WARNING: OCS/ECS statefile.\n");
+             }
+             if (!sntsc && !ecs && ntsc) {
+                 printf("- WARNING: NTSC statefile.\n");
+             }
+             if (sntsc && !ecs && !ntsc) {
+                 printf("- WARNING: PAL statefile.\n");
+             }
+         } else if (!strcmp(cname, "CRAM")) {
+             check_ram(cname, b, MB_CHIP, 0x000000, offset, size, flags, st);
+         } else if (!strcmp(cname, "BRAM")) {
+             check_ram(cname, b, MB_SLOW, 0xc00000, offset, size, flags, st);
+         } else if (!strcmp(cname, "FRAM")) {
+             check_ram(cname, b, MB_FAST, 0x200000, offset, size, flags, st);
+         } else if (!strcmp(cname, "ROM ")) {
+             check_rom(b, st);
+         }
+
+         free(b);
+         b = NULL;
+     }
+
+     if (!st->errors) {
+         find_extra_ram(st);
+         if (!st->extra_ram) {
+             printf("ERROR: At least 512k unused RAM required.\n");
+             st->errors++;
+         } else {
+             printf("%luk extra RAM at %08lx.\n", st->extra_ram_size >> 10, (ULONG)st->extra_ram);
+             st->extra_mem_pointer = st->extra_ram;
+         }
+     } else {
+         printf("ERROR: Incompatible hardware configuration.\n");
+         st->errors++;
+     }
+
+     // Non-NULL only if the "END " chunk itself carried data.
+     free(b);
+
+     return st->errors;
+ }
+
+ extern void runit(void*); // defined outside this file; processstate() calls it last, passing the state pointer
+ extern void callinflate(UBYTE*, UBYTE*); // defined outside this file; handlerambank() calls it as (destination, compressed source)
+
+ static void handlerambank(struct MemoryBank *mb, struct uaestate *st) // Write one bank's loaded chunk image to mb->targetaddr; 'st' is not used.
+ { // NOTE(review): reached from processstate(); presumably the OS is already shut down then, so keep this free of library calls - confirm.
+ UBYTE *sa = mb->addr + 16; /* skip chunk header + RAM size */
+ if (mb->flags & 1) { // flag bit 0 set: payload is compressed
+ // +2 = skip zlib header
+ callinflate(mb->targetaddr, sa + 2);
+ } else { // NOTE(review): the same 16-byte skip is used for uncompressed banks - confirm they also carry the 4-byte size field.
+ ULONG *s = (ULONG*)sa;
+ ULONG *d = (ULONG*)mb->targetaddr;
+ for (int i = 0; i < mb->size / 4; i++) { // longword copy; a trailing 1-3 bytes of mb->size would not be copied
+ *d++ = *s++;
+ }
+ }
+ }
+
+ /* Interrupts are off, supervisor state.
+  *
+  * Applies the loaded statefile to the machine: every memory bank that has a
+  * loaded chunk is written to its target address, then the floppy, AGA colour,
+  * custom chip, audio, sprite and CIA chunks are handed to their set_* helpers,
+  * and finally runit(st) is called. take_over() passes the relocated address
+  * of this function to killsystem().
+  */
+ static void processstate(struct uaestate *st)
+ {
+ volatile struct Custom *c = (volatile struct Custom*)0xdff000;
+
+ for (int i = 0; i < MEMORY_REGIONS; i++) {
+ if (i == MB_CHIP)
+ c->color[0] = 0x800; // colour register 0 is changed at each stage; presumably a visual progress marker
+ if (i == MB_SLOW)
+ c->color[0] = 0x080;
+ if (i == MB_FAST)
+ c->color[0] = 0x008;
+ struct MemoryBank *mb = &st->membanks[i];
+ if (mb->addr) { // only banks whose chunk was loaded into temporary memory
+ handlerambank(mb, st);
+ }
+ }
+ c->color[0] = 0x880;
+
+ // must be before set_cia
+ for (int i = 0; i < 4; i++) {
+ set_floppy(st->floppy_chunk[i], i);
+ }
+
+ c->color[0] = 0x808;
+
+ set_agacolor(st->aga_colors_chunk);
+ set_custom(st->custom_chunk);
+ for (int i = 0; i < 4; i++) {
+ set_audio(st->audio_chunk[i], i);
+ }
+ for (int i = 0; i < 8; i++) { // NOTE(review): parse_pass_2() has no branch that fills st->sprite_chunk[] - confirm where these are loaded
+ set_sprite(st->sprite_chunk[i], i);
+ }
+ set_cia(st->ciaa_chunk, 0);
+ set_cia(st->ciab_chunk, 1);
+
+ c->color[0] = 0x888;
+
+ runit(st); // last step; defined outside this file
+ }
+
+ /*
+  * Prepare for taking over the machine and hand control to killsystem().
+  *
+  * A temporary stack, a copy of the loader state and a copy of this program's
+  * first code hunk are placed in temporary memory. Absolute JSR/JMP
+  * instructions in the copy that target runit or callinflate are patched to
+  * point into the copy. After the user presses RETURN, killsystem() is called
+  * with the top of the temporary stack, the state copy and the address of
+  * processstate() inside the code copy.
+  *
+  * Returns early (with a message) when memory runs out or the program was
+  * not started from a CLI. Presumably killsystem() does not return.
+  */
+ static void take_over(struct uaestate *st)
+ {
+     // Copy stack, variables and code to safe location
+
+     UBYTE *tempsp = tempmem_allocate(TEMP_STACK_SIZE, st);
+     if (!tempsp) {
+         printf("Out of memory for temp stack (%lu bytes).\n", (ULONG)TEMP_STACK_SIZE);
+         return;
+     }
+
+     struct uaestate *tempst = (struct uaestate*)tempmem_allocate(sizeof(struct uaestate), st);
+     if (!tempst) {
+         printf("Out of memory for temp state variables (%lu bytes).\n", (ULONG)sizeof(struct uaestate));
+         return;
+     }
+     memcpy(tempst, st, sizeof(struct uaestate));
+
+     struct Process *me = (struct Process*)FindTask(0);
+     struct CommandLineInterface *cli = (struct CommandLineInterface*)((((ULONG)me->pr_CLI) << 2));
+     if (!cli) {
+         printf("CLI == NULL?\n");
+         return;
+     }
+     ULONG *module = (ULONG*)(cli->cli_Module << 2);
+     ULONG hunksize = module[-1] << 2;
+     UBYTE *newcode = tempmem_allocate(hunksize, st);
+     if (!newcode) {
+         printf("Out of memory for temp code (%lu bytes).\n", hunksize);
+         return;
+     }
+     memcpy(newcode, module, hunksize);
+
+     // ugly relocation hack but jumps to other module (asm.S) are always absolute..
+     // TODO: process the executable after linking
+     //
+     // Each candidate is an opcode word followed by a 32-bit operand, so the
+     // scan stops two words before the end: the operand is then never read
+     // (or patched) beyond the copied hunk.
+     ULONG words = hunksize / 2;
+     UWORD *cp = (UWORD*)newcode;
+     for (ULONG i = 0; i + 2 < words; i++, cp++) {
+         // JSR/JMP xxxxxxxx.L?
+         if (*cp != 0x4eb9 && *cp != 0x4ef9)
+             continue;
+         ULONG *ap = (ULONG*)(cp + 1);
+         void *addr = (void*)(*ap);
+         if (addr == runit || addr == callinflate) {
+             *ap = (ULONG)addr - (ULONG)module + (ULONG)newcode;
+         }
+     }
+
+     printf("Code=%08lx Stack=%08lx Data=%08lx. Press RETURN!\n", (ULONG)newcode, (ULONG)tempsp, (ULONG)tempst);
+     Delay(100); // So that key release gets processed by AmigaOS
+
+ #if 0
+     if (SysBase->LibNode.lib_Version >= 37) {
+         CacheClearU();
+     }
+ #endif
+
+     // Block until the user confirms with RETURN.
+     UBYTE b;
+     fread(&b, 1, 1, stdin);
+
+     if (GfxBase->LibNode.lib_Version >= 37) {
+         LoadView(NULL);
+         WaitTOF();
+         WaitTOF();
+     }
+
+     // No turning back!
+     extern void *killsystem(UBYTE*, struct uaestate*, ULONG);
+     killsystem(tempsp + TEMP_STACK_SIZE, tempst, (ULONG)processstate - (ULONG)module + (ULONG)newcode);
+ }
+
+ /*
+  * Entry point: "<program> <statefile>".
+  *
+  * Runs pass 1 (compatibility checks) and pass 2 (loading) on the statefile
+  * and, when both succeed, hands over to take_over(). If control comes back,
+  * all resources are released. The exit code is always 0.
+  */
+ int main(int argc, char *argv[])
+ {
+     FILE *f;
+     struct uaestate *st;
+
+     if (argc < 2) {
+         printf("Statefile parameter missing.\n");
+         return 0;
+     }
+
+     f = fopen(argv[1], "rb");
+     if (!f) {
+         printf("Couldn't open '%s'\n", argv[1]);
+         return 0;
+     }
+
+     st = calloc(sizeof(struct uaestate), 1);
+     if (!st) {
+         printf("Out of memory.\n");
+         fclose(f); // do not leak the open statefile
+         return 0;
+     }
+
+     if (!parse_pass_1(f, st)) {
+         fseek(f, 0, SEEK_SET);
+         if (!parse_pass_2(f, st)) {
+             take_over(st);
+         } else {
+             printf("Pass #2 failed (%ld errors).\n", st->errors);
+         }
+     } else {
+         printf("Pass #1 failed (%ld errors).\n", st->errors);
+     }
+
+     fclose(f);
+
+     // free_allocations() reads the allocation list stored in *st, so it has
+     // to run before the state structure itself is released.
+     free_allocations(st);
+     free(st);
+
+     return 0;
+ }