From 60860105989690e22dafd7bbec6c00028172b525 Mon Sep 17 00:00:00 2001 From: Toni Wilen Date: Sun, 5 Jan 2020 12:56:07 +0200 Subject: [PATCH] Data file gzip support. --- cputest/asm.S | 9 + cputest/inflate.S | 778 ++++++++++++++++++++++++++++++++++++++++++++ cputest/main.c | 243 ++++++++++---- cputest/makefile | 6 +- cputest/makefile.st | 6 +- 5 files changed, 969 insertions(+), 73 deletions(-) create mode 100644 cputest/inflate.S diff --git a/cputest/asm.S b/cputest/asm.S index c37f5f82..a00440b2 100644 --- a/cputest/asm.S +++ b/cputest/asm.S @@ -1,6 +1,7 @@ .text + .globl _callinflate .globl _execute_test000 .globl _execute_test010 .globl _execute_test020 @@ -37,6 +38,14 @@ S_TRACECNT = S_FPSR+4 S_TRACESTACK = S_TRACECNT+4 S_NEXT = S_TRACESTACK+12 +_callinflate: + movem.l a4-a5,-(sp) + move.l 4+2*4(sp),a4 + move.l 8+2*4(sp),a5 + bsr _inflate + movem.l (sp)+,a4-a5 + rts + | set CPU special registers _setcpu: move.l 4(sp),d1 | cpu_lvl diff --git a/cputest/inflate.S b/cputest/inflate.S new file mode 100644 index 00000000..bdd5a099 --- /dev/null +++ b/cputest/inflate.S @@ -0,0 +1,778 @@ + + .chip 68020 + .globl _inflate + + +/* + * inflate.S + * + * Decompression of DEFLATE streams, as produced by zip/gzip/pkzip and + * specified in RFC 1951 "DEFLATE Compressed Data Format Specification". + * + * Usage: Optionally configure the OPT_xxx options below at build time; + * at run time 'bsr inflate' with arguments: + * a4 = output buffer, a5 = input stream + * a6 = *end* of temporary storage area (only if OPT_STORAGE_OFFSTACK) + * All register values (including arguments) are preserved. + * + * Space requirements: 638-930 bytes code; 2044-2940 bytes stack. + * (NB1. Above ranges are [No Optimisations]-[All Optimisations]) + * (NB2. Stack space can be relocated to a separately-specified storage + * area, see OPT_STORAGE_OFFSTACK below) + * + * Timings: With all Optimisation Options enabled (see below) this routine + * will decompress on a basic 7MHz 68000 at ~25kB/s. An AmigaDOS track of + * data (5.5kB) is processed in ~220ms. This is only fractionally slower than + * the track can be fetched from disk, hence there is scope for a + * decompressing loader to keep CPU and disk both at near 100% utilisation. + * + * Written & released by Keir Fraser + * + * This is free and unencumbered software released into the public domain. + * See the file COPYING for more details, or visit . + */ + +/* Optimisation Option #1: + * Avoid long Huffman-tree walks by indexing the first 8 bits of each codeword + * in a 256-entry lookup table. This shortens all walks by 8 steps and since + * the most common codes are less than 8 bits, most tree walks are avoided. + * Also pre-shifts selected symbols in the code->symbol table, ready to be used + * as indexes into further lookup tables. + * SPEEDUP: 41% (c.w. no Options); COST: 122 bytes code, 896 bytes stack */ +#ifndef OPT_TABLE_LOOKUP +#define OPT_TABLE_LOOKUP 1 +#endif + +/* Optimisation Option #2: + * Inline functions in the main decode loop to avoid all BSR/RTS pairs. + * SPEEDUP: 15% (on top of Option #1); COST: 164 bytes code */ +#ifndef OPT_INLINE_FUNCTIONS +#define OPT_INLINE_FUNCTIONS 1 +#endif + +/* Optimisation Option #3: + * Unroll the copy loop for tuples by one iteration + * (so two bytes are copied per iteration). + * SPEEDUP: ~1% (on top of Options #1 and #2); COST: 6 bytes code */ +#ifndef OPT_UNROLL_COPY_LOOP +#define OPT_UNROLL_COPY_LOOP 1 +#endif + +/* Storage Option: + * All but 12 bytes of this routine's space requirement can be allocated + * off stack, in a data area specified in register a6. + * If this option is set then inflate must be called with a6 pointing at + * the *end* of the reserved storage area (+2032 or +2928 bytes, depending + * on whether OPT_TABLE_LOOKUP is enabled). + * SPEEDUP: none; COST: -2 bytes code (makes code slightly smaller) */ +#ifndef OPT_STORAGE_OFFSTACK +#define OPT_STORAGE_OFFSTACK 0 +#endif + +/* By default all lookup/conversion tables are generated on-the-fly on every + * call to inflate. In some cases this can be very inefficient. + * If this option is enabled then two new routines are generated: At start-of- + * day call 'inflate_gentables' with a6 pointing to the *end* of a 6000-byte + * block of memory. Then call 'inflate_fromtables' instead of 'inflate', with + * a6 still pointing to the end of the pre-generated memory block. + * SPEEDUP: variable; COST: 116 bytes code */ +#ifndef OPT_PREGENERATE_TABLES +#define OPT_PREGENERATE_TABLES 0 +#endif + +/* By default all registers are saved/restored across 'inflate' and + * 'inflate_fromtables'. This set can be reduced below. Note that if + * a4 is not saved then it will point at the end of the uncompressed output. + * If a5 is not saved then it will point at the end of the DEFLATE stream. */ +#ifndef SAVE_RESTORE_REGS +#define SAVE_RESTORE_REGS d0-d6/a0-a3 +#endif + +#if OPT_STORAGE_OFFSTACK +#define aS a6 +#else +#define aS sp +#endif + +/* Longest possible code. */ +#define MAX_CODE_LEN 16 + +/* (Maximum) alphabet sizes. */ +#define nr_codelen_symbols 19 +#define nr_litlen_symbols 288 +#define nr_distance_symbols 32 + +/* Alphabet-description stream for a static Huffman block (BTYPE=01b). */ +static_huffman_prefix: + dc.b 0xff, 0x5b, 0x00, 0x6c, 0x03, 0x36, 0xdb + dc.b 0xb6, 0x6d, 0xdb, 0xb6, 0x6d, 0xdb, 0xb6 + dc.b 0xcd, 0xdb, 0xb6, 0x6d, 0xdb, 0xb6, 0x6d + dc.b 0xdb, 0xa8, 0x6d, 0xce, 0x8b, 0x6d, 0x3b + +#if OPT_TABLE_LOOKUP + +/* Number of bytes required for code-lookup table/tree: + * - 256 2-byte entries for the 8-bit lookup table + * - Worst-case only 8 symbols decode directly in the table and all the rest + * are in a tree hanging off one table entry. This tree requires + * (nr_symbols-8)-1 internal 4-byte nodes. */ +#define LOOKUP_BYTES(nr_syms) (256*2+((nr_syms)-9)*4) + + /* a0 = len[], a1 = nodes[], d0 = nr_symbols */ + /* d1 = symbol beyond which all symbols get <<2 */ + /* a2-a3 are scratched */ +build_code: + movem.l d0-d7,-(aS) + + /* Allocate space for bl_count[]/next_code[] array on stack. */ + moveq #(MAX_CODE_LEN+1)/2,d1 + moveq #0,d2 +1: move.l d2,-(aS) + dbf d1,1b + + /* Count occurrences of each code length into bl_count[] array. */ + subq.w #1,d0 + move.w d0,d1 + move.l a0,a2 /* a2 = &len[0] */ +1: move.b (a2)+,d2 /* d2 = len[i] */ +#if MC68020 + addq.w #1,(aS,d2.w*2) +#else + add.b d2,d2 + addq.w #1,(aS,d2.w) /* bl_count[len[i]]++ */ +#endif + dbf d1,1b + + /* Calculate next_code[] start values for each code length. */ + move.l aS,a2 /* a2 = bl_count[] / next_code[] */ + moveq #MAX_CODE_LEN-1,d1 + moveq #0,d2 /* d2 = code */ + move.w d2,(aS) /* bl_count[0] = 0, ignore zero-length codes */ +1: add.w (a2),d2 + add.w d2,d2 /* code = (code + bl_count[i-1]) << 1 */ + move.w d2,(a2)+ /* next_code[i] = code */ + dbf d1,1b + + /* Create the Huffman-code lookup tree */ + move.w d0,d1 + moveq #127,d4 /* d4 = next_node = 127 */ + move.l a0,a2 /* a2 = &len[0] */ +1: moveq #0,d5 + move.b (a2)+,d5 /* d5 = len[i] / *len++ */ + jeq 4f + subq.w #1,d5 + move.w d5,d6 +#if MC68020 + move.w (aS,d6.w*2),d3 + addq.w #1,(aS,d6.w*2) +#else + add.w d6,d6 + move.w (aS,d6.w),d3 /* d3 = code = next_code[len[i]]++ */ + addq.w #1,(aS,d6.w) + move.w d5,d6 +#endif + + moveq #0,d2 +9: lsr.w #1,d3 + roxl.w #1,d2 + dbf d6,9b /* d5 = codelen-1; d2 = reversed code */ + move.b d2,d3 + add.w d3,d3 /* d3 = table offset */ + move.w d0,d6 + sub.w d1,d6 /* d6 = symbol */ + cmp.w (((MAX_CODE_LEN+1)/2)+1)*4+6(aS),d6 /* symbol > saved d1.w? */ + jls 9f + lsl.w #2,d6 /* symbol <<= 2 if so */ +9: cmp.b #9-1,d5 + jcc codelen_gt_8 + +codelen_le_8: /* codelen <= 8: leaf in table entry(s) */ + lsl.w #3,d6 + or.b d5,d6 /* d6 = (symbol<<3) | (codelen-1) */ + moveq #0,d2 + addq.b #2,d5 + bset d5,d2 /* d2 = 1<<(codelen+1) [table step] */ + move.w d2,d7 + neg.w d7 + and.w #511,d7 + or.w d7,d3 /* d3 = last table offset */ +9: move.w d6,(a1,d3.w) + sub.w d2,d3 + jcc 9b + jra 4f + +codelen_gt_8: /* codelen > 8: requires a tree walk */ + lsr.w #8,d2 + subq.b #8,d5 /* Skip the first 8 bits of code */ + lea (a1,d3.w),a3 /* pnode = table entry */ + +2: /* Walk through *pnode. */ + move.w (a3),d7 /* d3 = *pnode */ + jne 3f + /* Link missing: Create a new internal node */ + addq.w #1,d4 + move.w d4,d7 + bset #15,d7 + move.w d7,(a3) /* *pnode = ++next_node | INTERNAL */ +3: /* Take left or right branch depending on next code bit */ + lsr.b #1,d2 + addx.w d7,d7 +#if MC68020 + lea (a1,d7.w*2),a3 +#else + add.w d7,d7 + lea (a1,d7.w),a3 /* pnode = next_bit ? &node->r : &node->l */ +#endif +3: dbf d5,2b + + /* Insert the current symbol as a new leaf node */ + move.w d6,(a3) /* *pnode = sym */ +4: dbf d1,1b + + lea (((MAX_CODE_LEN+1)/2)+1)*4(aS),aS + movem.l (aS)+,d0-d7 + rts + + /* d5-d6/a5 = stream, a0 = tree */ + /* d0.w = result, d1.l = scratch */ +.macro STREAM_NEXT_SYMBOL + moveq #0,d0 /* 4 */ + moveq #7,d1 /* 4 */ + cmp.b d1,d6 /* 4 */ + jhi 99f /* 10 */ + /* Less than 8 bits cached; grab another byte from the stream */ + move.b (a5)+,d0 /* [8] */ + lsl.w d6,d0 /* [~14] */ + or.w d0,d5 /* [4] */ /* s->cur |= *p++ << s->nr */ + addq.b #8,d6 /* [4] */ /* s->nr += 8 */ + moveq #0,d0 /* [4] */ +99: /* Use next input byte as index into code lookup table */ + move.b d5,d0 /* 4 */ +#if MC68020 + move.w (a0,d0.w*2),d0 +#else + add.w d0,d0 /* 4 */ + move.w (a0,d0.w),d0 /* 14 */ +#endif + jpl 99f /* 10 (taken) */ + /* Code is longer than 8 bits: do the remainder via a tree walk */ + lsr.w #8,d5 + subq.b #8,d6 /* consume 8 bits from the stream */ +98: /* stream_next_bits(1), inlined & optimised */ + subq.b #1,d6 /* 4 cy */ + jcc 97f /* 10 cy (taken) */ + move.b (a5)+,d5 /* [8 cy] */ + moveq #7,d6 /* [4 cy] */ +97: lsr.w #1,d5 /* 8 cy */ + addx.w d0,d0 /* 4 cy */ +#if MC68020 + move.w (a0,d0.w*2),d0 +#else + add.w d0,d0 /* 4 cy */ + move.w (a0,d0.w),d0 /* 14 cy */ +#endif + jmi 98b /* 10 cy (taken); loop on INTERNAL flag */ + jra 98f /* TOTAL LOOP CYCLES ~= 54 */ +99: /* Symbol found directly: consume bits and return symbol */ + and.b d0,d1 /* 4 */ + addq.b #1,d1 /* 4 */ + lsr.w d1,d5 /* ~16 */ /* consume bits from the stream */ + sub.b d1,d6 /* 4 */ + lsr.w #3,d0 /* 12 */ /* d0 = symbol */ +98: /* ~94 CYCLES TOTAL [+ 34] */ +.endm + +#else /* !OPT_TABLE_LOOKUP */ + +/* Number of bytes required for code-lookup tree: + * - Every binary tree with N leaves has N-1 internal nodes. + * - Internal nodes require 4 bytes each. Leaves are free. */ +#define LOOKUP_BYTES(nr_syms) (((nr_syms)-1)*4) + + /* a0 = len[], a1 = nodes[], d0 = nr_symbols */ + /* a2-a3 are scratched */ +build_code: + movem.l d0-d5,-(aS) + + /* Allocate space for bl_count[]/next_code[] array on stack. */ + moveq #(MAX_CODE_LEN+1)/2,d1 + moveq #0,d2 +1: move.l d2,-(aS) + dbf d1,1b + + /* Count occurrences of each code length into bl_count[] array. */ + subq.w #1,d0 + move.w d0,d1 + move.l a0,a2 /* a2 = &len[0] */ +1: move.b (a2)+,d2 /* d2 = len[i] */ + add.b d2,d2 + addq.w #1,(aS,d2.w) /* bl_count[len[i]]++ */ + dbf d1,1b + + /* Calculate next_code[] start values for each code length. */ + move.l aS,a2 /* a2 = bl_count[] / next_code[] */ + moveq #MAX_CODE_LEN-1,d1 + moveq #0,d2 /* d2 = code */ + move.w d2,(aS) /* bl_count[0] = 0, ignore zero-length codes */ +1: add.w (a2),d2 + add.w d2,d2 /* code = (code + bl_count[i-1]) << 1 */ + move.w d2,(a2)+ /* next_code[i] = code */ + dbf d1,1b + + /* Create the Huffman-code lookup tree */ + move.w d0,d1 + moveq #0,d4 /* d4 = next_node */ + move.l a0,a2 /* a2 = &len[0] */ +1: moveq #0,d5 + move.b (a2)+,d5 /* d5 = len[i] / *len++ */ + jeq 4f + subq.w #1,d5 + add.w d5,d5 + move.w (aS,d5.w),d3 /* d3 = code = next_code[len[i]]++ */ + addq.w #1,(aS,d5.w) + lsr.w #1,d5 + /* Walk down the tree, creating nodes as necessary */ + moveq #0,d2 /* d2 = 0 (root node) */ + jra 3f + +2: /* Walk through *pnode. */ + move.w (a3),d2 /* d2 = *pnode */ + jne 3f + /* Link missing: Create a new internal node */ + addq.w #1,d4 + move.w d4,d2 + bset #15,d2 + move.w d2,(a3) /* *pnode = ++next_node | INTERNAL */ +3: /* Take left or right branch depending on next code bit */ + lsl.w #2,d2 + btst d5,d3 + jeq 3f + addq.w #2,d2 +3: lea (a1,d2.w),a3 /* pnode = next_bit ? &node->r : &node->l */ + dbf d5,2b + + /* Insert the current symbol as a new leaf node */ + move.w d0,d2 + sub.w d1,d2 + move.w d2,(a3) /* *pnode = sym */ +4: dbf d1,1b + + lea (((MAX_CODE_LEN+1)/2)+1)*4(aS),aS + movem.l (aS)+,d0-d5 + rts + + /* d5-d6/a5 = stream, a0 = tree */ + /* d0.w = result */ +.macro STREAM_NEXT_SYMBOL + moveq #0,d0 +99: /* stream_next_bits(1), inlined & optimised */ + subq.b #1,d6 /* 4 cy */ + jcc 98f /* 10 cy (taken) */ + move.b (a5)+,d5 /* [8 cy] */ + moveq #7,d6 /* [4 cy] */ +98: lsr.w #1,d5 /* 8 cy */ + addx.w d0,d0 /* 4 cy */ + add.w d0,d0 /* 4 cy */ + move.w (a0,d0.w),d0 /* 14 cy */ + jmi 99b /* 10 cy (taken); loop on INTERNAL flag set */ + /* TOTAL LOOP CYCLES ~= 54 */ +.endm + +#endif + + /* d1.b = nr, d5-d6/a5 = stream [fetched_bits/nr_fetched_bits/inp] */ + /* d0.w = result */ +.macro STREAM_NEXT_BITS +99: moveq #0,d0 + cmp.b d1,d6 + jcc 99f /* while (s->nr < nr) */ + move.b (a5)+,d0 + lsl.l d6,d0 + or.l d0,d5 /* s->cur |= *p++ << s->nr */ + addq.b #8,d6 /* s->nr += 8 */ + jra 99b +99: bset d1,d0 + subq.w #1,d0 /* d0 = (1<cur & ((1<cur >>= nr */ + sub.b d1,d6 /* s->nr -= nr */ +.endm + +#if OPT_INLINE_FUNCTIONS +#define INLINE_stream_next_bits STREAM_NEXT_BITS +#define INLINE_stream_next_symbol STREAM_NEXT_SYMBOL +#else +#define INLINE_stream_next_bits jbsr stream_next_bits +#define INLINE_stream_next_symbol jbsr stream_next_symbol +#endif + +stream_next_bits: + STREAM_NEXT_BITS + rts + + /* d5-d6/a5 = stream, a4 = output */ + /* d0-d1 are scratched */ +uncompressed_block: +#if OPT_TABLE_LOOKUP + /* Push whole bytes back into input stream. */ + lsr.w #3,d6 + sub.w d6,a5 +#else + /* No need to push bytes back into input stream because stream_next_ + * {bits,symbol} will never leave more than 7 bits cached. */ +#endif + /* Snap input stream up to byte boundary. */ + moveq #0,d5 + moveq #0,d6 + /* Read block header and copy LEN bytes. */ + moveq #16,d1 + jbsr stream_next_bits /* LEN */ + addq.w #2,a5 /* skip NLEN */ + subq.w #1,d0 /* d0.w = len-1 (for dbf) */ +1: move.b (a5)+,(a4)+ + dbf d0,1b + rts + +#define o_hdist /*0*/ +#define o_hlit 2 +#define o_lens (o_hlit+2) +#define o_codelen_tree (o_lens+nr_litlen_symbols+nr_distance_symbols) +#if OPT_TABLE_LOOKUP +/* Lit/len and codelen lookup structures share space. */ +#define o_litlen_tree o_codelen_tree +#else +#define o_litlen_tree (o_codelen_tree+LOOKUP_BYTES(nr_codelen_symbols)) +#endif +#define o_dist_tree (o_litlen_tree+LOOKUP_BYTES(nr_litlen_symbols)) +#define o_stream (o_dist_tree+LOOKUP_BYTES(nr_distance_symbols)) +#define o_frame (o_stream+3*4) +#if OPT_STORAGE_OFFSTACK +#define o_mode (o_frame) +#else +/* Allow for BSR return address from decoder */ +#define o_mode (o_frame+4) +#endif +#define o_dist_extra (o_mode+4) +#define o_length_extra (o_dist_extra+30*4) + + /* d5-d6/a5 = stream, a4 = output */ + /* d0-d4,a0-a3 are scratched */ +static_huffman: + movem.l d5-d6/a5,-(aS) + moveq #0,d5 + moveq #0,d6 + lea static_huffman_prefix(pc),a5 + move.w #o_stream/4-2,d0 + jra 1f + + /* d5-d6/a5 = stream, a4 = output */ + /* d0-d4,a0-a3 are scratched */ +dynamic_huffman: + /* Allocate stack space for len[] and node[] arrays */ + move.w #o_frame/4-2,d0 +1: moveq #0,d1 +1: move.l d1,-(aS) + dbf d0,1b + /* HLIT = stream_next_bits(5) + 257 */ + moveq #5,d1 + jbsr stream_next_bits + add.w #257,d0 + move.w d0,-(aS) + /* HDIST = stream_next_bits(5) + 1 */ + moveq #5,d1 + jbsr stream_next_bits + addq.w #1,d0 + move.w d0,-(aS) + /* HCLEN = stream_next_bits(4) + 4 */ + moveq #4,d1 + jbsr stream_next_bits + addq.w #4-1,d0 /* -1 for dbf */ + /* Initialise len[] array with code-length symbol code lengths */ + lea codelen_order(pc),a1 + lea o_lens(aS),a0 /* a0 = len[] */ + moveq #0,d2 + move.w d0,d3 +1: moveq #3,d1 + jbsr stream_next_bits + move.b (a1)+,d2 + move.b d0,(a0,d2.w) /* len[codelen_order[i++]] = next_bits(3) */ + dbf d3,1b + /* Build the codelen_tree */ + lea o_codelen_tree(aS),a1 + moveq #nr_codelen_symbols,d0 +#if OPT_TABLE_LOOKUP + moveq #127,d1 /* don't left-shift any symbols */ +#endif + jbsr build_code /* build_code(codelen_tree) */ + /* Read the literal/length & distance code lengths */ + move.w o_hlit(aS),d2 + add.w o_hdist(aS),d2 + subq.w #1,d2 /* d2 = hlit+hdist-1 */ + move.l a0,a2 /* a2 = len[] */ + move.l a1,a0 /* a0 = a1 = codelen_tree */ +1: INLINE_stream_next_symbol + cmp.b #16,d0 + jcs c_lit + jeq c_16 + cmp.b #17,d0 + jeq c_17 +c_18: /* 18: Repeat zero N times */ + moveq #7,d1 + jbsr stream_next_bits + addq.w #11-3,d0 + jra 2f +c_17: /* 17: repeat zero N times */ + moveq #3,d1 + jbsr stream_next_bits +2: moveq #0,d1 + jra 3f +c_16: /* 16: repeat previous N times */ + moveq #2,d1 + jbsr stream_next_bits + move.b -1(a2),d1 +3: addq.w #3-1,d0 + sub.w d0,d2 +4: move.b d1,(a2)+ + dbf d0,4b + jra 5f +c_lit: /* 0-16: Literal symbol */ + move.b d0,(a2)+ +5: dbf d2,1b + /* Build the lit/len and distance trees */ +#if OPT_TABLE_LOOKUP + /* Clear the codelen tree (shared space with lit/len tree). + * NB. a0 = a1 = codelen_tree = litlen_tree */ + moveq #0,d0 + move.w #LOOKUP_BYTES(nr_codelen_symbols)/4-1,d1 +1: move.l d0,(a0)+ + dbf d1,1b + /* litlen_tree (= codelen_tree) is already in a1, and now zeroed. */ +#else + lea o_litlen_tree(aS),a1 +#endif + lea o_lens(aS),a0 + move.w o_hlit(aS),d0 +#if OPT_TABLE_LOOKUP + move.w #256,d1 + move.w d1,d4 /* left-shift symbols >127 (i.e., lengths) */ +#endif + jbsr build_code /* build_code(litlen_tree) */ + add.w d0,a0 + lea o_dist_tree(aS),a1 + move.w o_hdist(aS),d0 +#if OPT_TABLE_LOOKUP + moveq #0,d1 /* left-shift all symbols (i.e., distances) */ +#endif + jbsr build_code /* build_code(dist_tree) */ + /* Reinstate the main stream if we used the static prefix */ + tst.l o_stream+8(aS) + jeq decode_loop + movem.l o_stream(aS),d5-d6/a5 + /* Now decode the compressed data stream up to EOB */ +decode_loop: + lea o_litlen_tree(aS),a0 + /* START OF HOT LOOP */ +2: INLINE_stream_next_symbol /* litlen_sym */ +#if OPT_TABLE_LOOKUP + cmp.w d4,d0 /* 4 cy (d4.w = 256) */ +#else + cmp.w #256,d0 /* 8 cy */ +#endif + jcc 2f /* 8 cy */ + /* 0-255: Byte literal */ + move.b d0,(a4)+ /* 8 cy */ + jra 2b /* 10 cy */ + /* END OF HOT LOOP -- 30 + ~108 + [34] = ~160 CYCLES */ +9: /* 256: End-of-block: we're done */ + lea o_frame(aS),aS + rts +2: jeq 9b + /* 257+: pair */ +#if !OPT_TABLE_LOOKUP /* Already shifted in case of OPT_TABLE_LOOKUP */ + lsl.w #2,d0 +#endif + lea o_length_extra-257*4(aS),a2 + add.w d0,a2 + move.w (a2)+,d1 + INLINE_stream_next_bits + add.w (a2),d0 + move.w d0,d3 /* d3 = cplen */ + lea o_dist_tree(aS),a0 + INLINE_stream_next_symbol /* dist_sym */ +#if !OPT_TABLE_LOOKUP /* Already shifted in case of OPT_TABLE_LOOKUP */ + lsl.w #2,d0 +#endif + lea o_dist_extra(aS),a2 + add.w d0,a2 + move.w (a2)+,d1 + INLINE_stream_next_bits + add.w (a2),d0 /* d0 = cpdst */ + move.l a4,a0 + sub.w d0,a0 /* a0 = outp - cpdst */ +#if OPT_UNROLL_COPY_LOOP + lsr.w #1,d3 + jcs 4f + subq.w #1,d3 +3: move.b (a0)+,(a4)+ +4: move.b (a0)+,(a4)+ +#else + subq.w #1,d3 +3: move.b (a0)+,(a4)+ +#endif + dbf d3,3b + jra decode_loop + +#if !OPT_INLINE_FUNCTIONS +stream_next_symbol: + STREAM_NEXT_SYMBOL + rts +#endif + + /* Build a base/extra-bits table on the stack. + * d0 = #pairs-1, d2 = max_value, d4 = log_2(extrabits_repeat) */ +build_base_extrabits: +#if !OPT_STORAGE_OFFSTACK + move.l (sp)+,a0 +#endif +1: move.w d0,d3 + lsr.w d4,d3 + subq.w #1,d3 + jcc 2f + moveq #0,d3 +2: moveq #0,d1 + bset d3,d1 /* d1 = 1 << extrabits */ + sub.w d1,d2 /* d2 = base */ + move.w d2,-(aS) + move.w d3,-(aS) + dbf d0,1b +#if !OPT_STORAGE_OFFSTACK + jmp (a0) +#else + rts +#endif + +dispatch: /* Decoder dispatch table. */ + dc.b uncompressed_block - uncompressed_block + dc.b static_huffman - uncompressed_block + dc.b dynamic_huffman - uncompressed_block + +codelen_order: /* Order of code lengths for the code length alphabet. */ + dc.b 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + + /* a4 = output, a5 = input, all regs preserved + * a6 = *end* of storage area (only if OPT_STORAGE_OFFSTACK) */ +_inflate: + movem.l SAVE_RESTORE_REGS,-(aS) + + /* Build the base/extra-bits table */ + move.l #258,d2 + move.l d2,-(aS) + addq.w #1,d2 + moveq #27,d0 + moveq #2,d4 + jbsr build_base_extrabits + + /* Build the base/extra-bits table */ + move.w #32769,d2 + moveq #29,d0 + moveq #1,d4 + jbsr build_base_extrabits + + /* Initialise the stream */ + moveq #0,d5 /* d5 = stream: fetched data */ + moveq #0,d6 /* d6 = stream: nr fetched bits */ + +1: /* Process a block: Grab the BTYPE|BFINAL 3-bit code */ + moveq #3,d1 + jbsr stream_next_bits + move.l d0,-(aS) + /* Dispatch to the correct decoder for this block */ + lsr.b #1,d0 + move.b dispatch(pc,d0.w),d0 + lea uncompressed_block(pc),a0 + jsr (a0,d0.w) + /* Keep going until we see BFINAL=1 */ + move.l (aS)+,d0 + lsr.b #1,d0 + jcc 1b + + /* Pop the base/extra-bits lookup tables */ + lea (30+29)*4(aS),aS + + movem.l (aS)+,SAVE_RESTORE_REGS + rts + +#if OPT_PREGENERATE_TABLES +pregen_static_huffman: + lea -o_frame(aS),aS /* frame pre-generated; skip over it */ + move.w #256,d4 + jra decode_loop +pregen_dynamic_huffman: + move.l (aS),d0 + lea -3000(aS),aS /* move to dynamic-huffman frame */ + move.l d0,(aS) /* copy o_mode into it */ + jbsr dynamic_huffman + lea 3000(aS),aS + rts + + /* Pre-generate conversion tables for Inflate. */ + /* a6 = Pointer to end of 6000-byte block of memory to contain + * pre-generated tables. All registers preserved. */ +inflate_gentables: + movem.l a5-a6,-(sp) + lea pregen_dummy_block(pc),a5 + jbsr inflate /* static block */ + lea -3000(aS),aS + lea pregen_dummy_block(pc),a5 + jbsr inflate /* dynamic block */ + movem.l (sp)+,a5-a6 + rts + + /* Inflate, using pre-generated tables. */ + /* a4 = output, a5 = input, all regs preserved + * a6 = *end* of 6000-byte pre-generated storage area */ +inflate_fromtables: + movem.l SAVE_RESTORE_REGS,-(aS) + + /* Skip the pre-generated base/extra-bits lookup tables */ + lea -(30+29)*4(aS),aS + + /* Initialise the stream */ + moveq #0,d5 /* d5 = stream: fetched data */ + moveq #0,d6 /* d6 = stream: nr fetched bits */ + +1: /* Process a block: Grab the BTYPE|BFINAL 3-bit code */ + moveq #3,d1 + jbsr stream_next_bits + move.l d0,-(aS) + /* Dispatch to the correct decoder for this block */ + and.b #0xfe,d0 + move.w pregen_dispatch(pc,d0.w),d0 + lea uncompressed_block(pc),a0 + jsr (a0,d0.w) + /* Keep going until we see BFINAL=1 */ + move.l (aS)+,d0 + lsr.b #1,d0 + jcc 1b + + /* Pop the base/extra-bits lookup tables */ + lea (30+29)*4(aS),aS + + movem.l (aS)+,SAVE_RESTORE_REGS + rts + +pregen_dispatch: + dc.w uncompressed_block - uncompressed_block + dc.w pregen_static_huffman - uncompressed_block + dc.w pregen_dynamic_huffman - uncompressed_block +pregen_dummy_block: /* A single static block containing EOB symbol only */ + dc.b 0x03,0x00 +#endif /* OPT_PREGENERATE_TABLES */ + +#undef o_hdist +#undef o_hlit +#undef o_lens +#undef o_codelen_tree +#undef o_litlen_tree +#undef o_dist_tree +#undef o_frame diff --git a/cputest/main.c b/cputest/main.c index 0b314305..17ff3f4c 100644 --- a/cputest/main.c +++ b/cputest/main.c @@ -27,6 +27,8 @@ typedef signed char uae_s8; #include "cputest_defines.h" +extern void callinflate(uae_u8*, uae_u8*); + struct fpureg { uae_u16 exp; @@ -411,23 +413,124 @@ static void end_test(void) touser(enable_data); } -static uae_u8 *load_file(const char *path, const char *file, uae_u8 *p, int *sizep, int exiterror) +static int readdata(uae_u8 *p, int size, FILE *f, uae_u8 *unpack, int *offsetp) +{ + if (!unpack) { + return fread(p, 1, size, f); + } else { + memcpy(p, unpack + (*offsetp), size); + (*offsetp) += size; + return size; + } +} +static void seekdata(int seek, FILE *f, uae_u8 *unpack, int *offsetp) +{ + if (!unpack) { + fseek(f, seek, SEEK_CUR); + } else { + (*offsetp) += seek; + } +} + +static uae_u8 *parse_gzip(uae_u8 *gzbuf, int *sizep) +{ + uae_u8 *end = gzbuf + (*sizep); + uae_u8 flags = gzbuf[3]; + uae_u16 v; + if (gzbuf[0] != 0x1f && gzbuf[1] != 0x8b) + return NULL; + gzbuf += 10; + if (flags & 2) /* multipart not supported */ + return NULL; + if (flags & 32) /* encryption not supported */ + return NULL; + if (flags & 4) { /* skip extra field */ + v = *gzbuf++; + v |= (*gzbuf++) << 8; + gzbuf += v + 2; + } + if (flags & 8) { /* get original file name */ + while (*gzbuf++); + } + if (flags & 16) { /* skip comment */ + while (*gzbuf++); + } + *sizep = (end[-4] << 0) | (end[-3] << 8) | (end[-2] << 16) | (end[-1] << 24); + return gzbuf; +} + +static uae_u8 *load_file(const char *path, const char *file, uae_u8 *p, int *sizep, int exiterror, int candirect) { char fname[256]; - sprintf(fname, "%s%s", path, file); + uae_u8 *unpack = NULL; + int unpackoffset = 0; + int size = 0; + + sprintf(fname, "%s%s.gz", path, file); FILE *f = fopen(fname, "rb"); - if (!f) { - if (exiterror) { - printf("Couldn't open '%s'\n", fname); + if (f) { + fseek(f, 0, SEEK_END); + int gsize = ftell(f); + fseek(f, 0, SEEK_SET); + uae_u8 *gzbuf = malloc(gsize); + if (!gzbuf) { + printf("Couldn't allocate %ld bytes (packed), file '%s'\n", gsize, fname); exit(0); } - return NULL; + if (fread(gzbuf, 1, gsize, f) != gsize) { + printf("Couldn't read file '%s'\n", fname); + exit(0); + } + fclose(f); + size = gsize; + uae_u8 *gzdata = parse_gzip(gzbuf, &size); + if (!gzdata) { + printf("Couldn't parse gzip file '%s'\n", fname); + exit(0); + } + f = NULL; + if (!p) { + p = calloc(1, size); + if (!p) { + printf("Couldn't allocate %ld bytes, file '%s'\n", size, fname); + exit(0); + } + printf("Decompressing '%s' (%ld -> %ld)\n", fname, gsize, size); + callinflate(p, gzdata); + *sizep = size; + return p; + } else if (candirect) { + printf("Decompressing '%s' (%ld -> %ld)\n", fname, gsize, size); + callinflate(p, gzdata); + *sizep = size; + return p; + } else { + unpack = calloc(1, size); + if (!unpack) { + printf("Couldn't allocate %ld bytes (unpack), file '%s'\n", size, fname); + exit(0); + } + printf("Decompressing '%s' (%ld -> %ld)\n", fname, gsize, size); + callinflate(unpack, gzdata); + *sizep = size; + } } - int size = *sizep; - if (size < 0) { - fseek(f, 0, SEEK_END); - size = ftell(f); - fseek(f, 0, SEEK_SET); + if (!unpack) { + sprintf(fname, "%s%s", path, file); + f = fopen(fname, "rb"); + if (!f) { + if (exiterror) { + printf("Couldn't open '%s'\n", fname); + exit(0); + } + return NULL; + } + size = *sizep; + if (size < 0) { + fseek(f, 0, SEEK_END); + size = ftell(f); + fseek(f, 0, SEEK_SET); + } } if (!p) { p = calloc(1, size); @@ -437,13 +540,13 @@ static uae_u8 *load_file(const char *path, const char *file, uae_u8 *p, int *siz } } if (safe_memory_end < p || safe_memory_start >= p + size) { - *sizep = fread(p, 1, size, f); + *sizep = readdata(p, size, f, unpack, &unpackoffset); } else { if (size > 0 && p < safe_memory_start) { int size2 = safe_memory_start - p; if (size2 > size) size2 = size; - if (fread(p, 1, size2, f) != size2) + if (readdata(p, size2, f, unpack, &unpackoffset) != size2) goto end; p += size2; size -= size2; @@ -454,7 +557,7 @@ static uae_u8 *load_file(const char *path, const char *file, uae_u8 *p, int *siz int size2 = safe_memory_end - p; if (size2 > size) size2 = size; - fseek(f, size2, SEEK_CUR); + seekdata(size2, f, unpack, &unpackoffset); p += size2; size -= size2; } @@ -469,14 +572,14 @@ static uae_u8 *load_file(const char *path, const char *file, uae_u8 *p, int *siz printf("Couldn't allocate safe tmp memory (%ld bytes)\n", size2); exit(0); } - fread(tmp, 1, size2, f); + readdata(tmp, size2, f, unpack, &unpackoffset); if (memcmp(tmp, p, size2)) { printf("Disable write bus error mode and press any key (SPACE=skip,ESC=abort)\n"); int ch = getchar(); if (ch == 27) { exit(0); } else if (ch == 32) { - fseek(f, size2, SEEK_CUR); + seekdata(size2, f, unpack, &unpackoffset); p += size2; size -= size2; } else { @@ -497,17 +600,22 @@ static uae_u8 *load_file(const char *path, const char *file, uae_u8 *p, int *siz } } if (size > 0) { - if (fread(p, 1, size, f) != size) + if (readdata(p, size, f, unpack, &unpackoffset) != size) goto end; } size = *sizep; } if (*sizep != size) { end: - printf("Couldn't read file '%s'\n", fname); + printf("Couldn't read file '%s' (%ld <> %ld)\n", fname, *sizep, size); exit(0); } - fclose(f); + if (f) { + fclose(f); + } + if (unpack) { + free(unpack); + } return p; } @@ -2117,10 +2225,11 @@ static void freestuff(void) #endif } -static uae_u32 read_u32(FILE* f) +static uae_u32 read_u32(uae_u8 *headerfile, int *poffset) { uae_u8 data[4] = { 0 }; - fread(data, 1, 4, f); + memcpy(data, headerfile + (*poffset), 4); + (*poffset) += 4; return gl(data); } @@ -2137,44 +2246,48 @@ static int test_mnemo(const char *path, const char *opcode) errors = 0; quit = 0; - sprintf(tfname, "%s%s/0000.dat", path, opcode); - FILE *f = fopen(tfname, "rb"); - if (!f) { - printf("Couldn't open '%s'\n", tfname); + + sprintf(tfname, "%s/0000.dat", opcode); + size = -1; + uae_u8 *headerfile = load_file(path, tfname, NULL, &size, 1, 1); + if (!headerfile) { exit(0); } - v = read_u32(f); + int headoffset = 0; + v = read_u32(headerfile, &headoffset); if (v != DATA_VERSION) { printf("Invalid test data file (header)\n"); exit(0); } - starttimeid = read_u32(f); - uae_u32 hmem_lmem = read_u32(f); + starttimeid = read_u32(headerfile, &headoffset); + uae_u32 hmem_lmem = read_u32(headerfile, &headoffset); hmem_rom = (uae_s16)(hmem_lmem >> 16); lmem_rom = (uae_s16)(hmem_lmem & 65535); - test_memory_addr = read_u32(f); - test_memory_size = read_u32(f); + test_memory_addr = read_u32(headerfile, &headoffset); + test_memory_size = read_u32(headerfile, &headoffset); test_memory_end = test_memory_addr + test_memory_size; - opcode_memory_addr = read_u32(f); + opcode_memory_addr = read_u32(headerfile, &headoffset); opcode_memory = (uae_u8*)opcode_memory_addr; - uae_u32 lvl_mask = read_u32(f); + uae_u32 lvl_mask = read_u32(headerfile, &headoffset); lvl = (lvl_mask >> 16) & 15; interrupt_mask = (lvl_mask >> 20) & 7; addressing_mask = (lvl_mask & 0x80000000) ? 0xffffffff : 0x00ffffff; sr_undefined_mask = lvl_mask & 0xffff; safe_memory_mode = (lvl_mask >> 23) & 3; - fpu_model = read_u32(f); - test_low_memory_start = read_u32(f); - test_low_memory_end = read_u32(f); - test_high_memory_start = read_u32(f); - test_high_memory_end = read_u32(f); - safe_memory_start = (uae_u8*)read_u32(f); - safe_memory_end = (uae_u8*)read_u32(f); - user_stack_memory = read_u32(f); - super_stack_memory = read_u32(f); - fread(inst_name, 1, sizeof(inst_name) - 1, f); + fpu_model = read_u32(headerfile, &headoffset); + test_low_memory_start = read_u32(headerfile, &headoffset); + test_low_memory_end = read_u32(headerfile, &headoffset); + test_high_memory_start = read_u32(headerfile, &headoffset); + test_high_memory_end = read_u32(headerfile, &headoffset); + safe_memory_start = (uae_u8*)read_u32(headerfile, &headoffset); + safe_memory_end = (uae_u8*)read_u32(headerfile, &headoffset); + user_stack_memory = read_u32(headerfile, &headoffset); + super_stack_memory = read_u32(headerfile, &headoffset); + memcpy(inst_name, headerfile + headoffset, sizeof(inst_name) - 1); inst_name[sizeof(inst_name) - 1] = 0; + free(headerfile); + headerfile = NULL; int lvl2 = cpu_lvl; if (lvl2 == 5 && lvl2 != lvl) @@ -2222,7 +2335,7 @@ static int test_mnemo(const char *path, const char *opcode) } size = test_memory_size; - load_file(path, "tmem.dat", test_memory, &size, 1); + load_file(path, "tmem.dat", test_memory, &size, 1, 0); if (size != test_memory_size) { printf("tmem.dat size mismatch\n"); exit(0); @@ -2246,51 +2359,39 @@ static int test_mnemo(const char *path, const char *opcode) for (;;) { printf("%s. %lu...\n", tfname, testcnt); - sprintf(tfname, "%s%s/%04ld.dat", path, opcode, filecnt); - FILE *f = fopen(tfname, "rb"); - if (!f) + sprintf(tfname, "%s/%04ld.dat", opcode, filecnt); + + test_data_size = -1; + test_data = load_file(path, tfname, NULL, &test_data_size, 0, 1); + if (!test_data) break; - fread(data, 1, 4, f); - if (gl(data) != DATA_VERSION) { + if (gl(test_data) != DATA_VERSION) { printf("Invalid test data file (header)\n"); exit(0); } - fread(data, 1, 4, f); - if (gl(data) != starttimeid) { + if (gl(test_data + 4) != starttimeid) { printf("Test data file header mismatch (old test data file?)\n"); break; } - fseek(f, 0, SEEK_END); - test_data_size = ftell(f); - fseek(f, 16, SEEK_SET); - test_data_size -= 16; - if (test_data_size <= 0) - break; - test_data = calloc(1, test_data_size); - if (!test_data) { - printf("Couldn't allocate memory for '%s', %lu bytes\n", tfname, test_memory_size); - exit(0); - } - if (fread(test_data, 1, test_data_size, f) != test_data_size) { - printf("Couldn't read '%s'\n", fname); - free(test_data); - break; - } - fclose(f); if (test_data[test_data_size - 1] != CT_END_FINISH) { printf("Invalid test data file (footer)\n"); free(test_data); exit(0); } + test_data_size -= 16; + if (test_data_size <= 0) + break; + test_data += 16; process_test(test_data); + test_data -= 16; + + free(test_data); if (errors || quit) { - free(test_data); break; } - free(test_data); filecnt++; } @@ -2437,9 +2538,9 @@ int main(int argc, char *argv[]) sprintf(path + strlen(path), "%lu/", 68000 + (cpu_lvl == 5 ? 6 : cpu_lvl) * 10); low_memory_size = -1; - low_memory_temp = load_file(path, "lmem.dat", NULL, &low_memory_size, 0); + low_memory_temp = load_file(path, "lmem.dat", NULL, &low_memory_size, 0, 1); high_memory_size = -1; - high_memory_temp = load_file(path, "hmem.dat", NULL, &high_memory_size, 0); + high_memory_temp = load_file(path, "hmem.dat", NULL, &high_memory_size, 0, 1); #ifndef M68K if (low_memory_size > 0) diff --git a/cputest/makefile b/cputest/makefile index 341569ba..67b586ad 100644 --- a/cputest/makefile +++ b/cputest/makefile @@ -10,7 +10,8 @@ LINK_CFLAGS = -mcrt=nix13 -lm -s OBJS = main.o asm040.o amiga.o \ decode_ea.o globals.o opcode_handler_cpu.o opcode_handler_fpu.o \ - opcode_handler_mmu.o opcodes_cpu.o opcodes_fpu.o opcodes_mmu.o util.o + opcode_handler_mmu.o opcodes_cpu.o opcodes_fpu.o opcodes_mmu.o util.o \ + inflate.o all: $(OBJS) $(CC) $(LINK_CFLAGS) -o cputest $^ @@ -45,6 +46,9 @@ opcodes_mmu.o: adis/opcodes_mmu.c util.o: adis/util.c $(CC) $(CFLAGS) -I. -c -o $@ adis/util.c +inflate.o: inflate.S + $(CC) $(CFLAGS) -I. -c -o $@ inflate.S + asm040.o: asm040.S $(AS) -m68020 -o $@ asm040.S diff --git a/cputest/makefile.st b/cputest/makefile.st index a23373f5..377f220b 100644 --- a/cputest/makefile.st +++ b/cputest/makefile.st @@ -10,7 +10,8 @@ LINK_CFLAGS = -lm -s OBJS = main.o asm040.o atari.o \ decode_ea.o globals.o opcode_handler_cpu.o opcode_handler_fpu.o \ - opcode_handler_mmu.o opcodes_cpu.o opcodes_fpu.o opcodes_mmu.o util.o + opcode_handler_mmu.o opcodes_cpu.o opcodes_fpu.o opcodes_mmu.o util.o \ + inflate.o all: $(OBJS) $(CC) $(LINK_CFLAGS) -o cputest.ttp $^ @@ -45,6 +46,9 @@ opcodes_mmu.o: adis/opcodes_mmu.c util.o: adis/util.c $(CC) $(CFLAGS) -I. -c -o $@ adis/util.c +inflate.o: inflate.S + $(CC) $(CFLAGS) -I. -c -o $@ inflate.S + asm040.o: asm040.S $(AS) -m68020 -o $@ asm040.S -- 2.47.3