/* This source code in this file is licensed to You by Castle Technology
 * Limited ("Castle") and its licensors on contractual terms and conditions
 * ("Licence") which entitle you freely to modify and/or to distribute this
 * source code subject to Your compliance with the terms of the Licence.
 * 
 * This source code has been made available to You without any warranties
 * whatsoever. Consequently, Your use, modification and distribution of this
 * source code is entirely at Your own risk and neither Castle, its licensors
 * nor any other person who has contributed to this source code shall be
 * liable to You for any loss or damage which You may suffer as a result of
 * Your use, modification or distribution of this source code.
 * 
 * Full details of Your rights and obligations are set out in the Licence.
 * You should have received a copy of the Licence with this source code file.
 * If You have not received a copy, the text of the Licence is available
 * online at www.castle-technology.co.uk/riscosbaselicence.htm
 */
/*
 * Title:     squeeze - compression of ADFS executable images
 * Author:    RCC
 * Copyright: (C) 1987, Acorn Computers Ltd, Cambridge, England.
 * Date:      03-Nov-87
 * LastEdit:  28-Mar-88
	      19-Jul-88 just to change the version to 1.00, and date (JSutton)
	      21-Jul-88 remove reference to xpand in help text (JRS)
	      07-Mar-91 add recognition of MOV R0, R0 as well as BLNV $0
 */

#include <stdint.h>
#include <inttypes.h>

#ifdef __STDC__
#  include <string.h>
#  include <stdlib.h>
#else
#  include <strings.h>
/* extern char *malloc(); */
/*#  define  DATE      "Mar 07 1991" */
#endif

#include "VersionNum"

#include <stdio.h>
#include <time.h>
#include <signal.h>
#ifdef __riscos
#include "CLib/kernel.h"
#include "CLib/swis.h"
#else
typedef struct {
   int load, exec;       /* load, exec addresses */
   int start, end;       /* start address/length, end address/attributes */
} _kernel_osfile_block;
#endif

//#include "CLX/err.h"
//#include "CLX/host.h"
//#include "CLX/hash.h"
//#include "CLX/wholefls.h"

#ifndef __riscos
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#endif

/*
 * squeeze takes an ARM executable image file and compresses it, usually
 * to about half the size, adding decompression code so that the image
 * will automatically expand itself when it is run.
 *
 * For details of the compression scheme, see doc.squeeze.  Briefly,
 * the image is treated as a sequence of 32-bit words, and each word
 * is encoded in one of four ways, specified by a 4-bit nibble:
 *   zero -> nibble 0
 *   the 7*256 most common word values are encoded with one byte extra as
 *   an index into a table
 *   the 7*256 most common upper-3-byte values are encoded with one byte
 *   extra as index into another table, with the low byte separate
 *   anything else is given in full as 4 bytes.
 *
 * The tables of common values are sorted into ascending order
 * and encoded in a devious way.
 */

#define DATE Module_Date
/* If macro value is empty, expression rewrites to "0 * + 1" which is zero. */
#if 0 * Module_MinorVersion_CMHG + 1 == 0
#  define VSN  Module_MajorVersion
#else
#  define VSN  Module_MajorVersion " (" Module_MinorVersion ")"
#endif
#define BRIEF	   "compress an executable ARM-code program"
#define SELF	   "squeeze"
#define HASHSIZE   (8*1024)
#define CHUNKSIZE  (16*1024)

#include "squeeze.h"

#if 1
#include "unsqueeze1.h"
#include "unsqrm1.h"
#else
/*extern void UnSqueeze_FindUnSqueezeCode(int *base, int *limit); */
extern int UnSqueeze_UnSqueezeBase, UnSqueeze_UnSqueezeLimit;
extern int unsqueeze_base, unsqueeze_limit, unsqueeze_end;
#endif

static int verbose;
static int debug;
static int force;

static clock_t lastticks;

/*
 * CLX ickyness
 */

#define err_fail(X...) do { fprintf(stderr, X); fprintf(stderr, "\n"); exit(EXIT_FAILURE); } while(0)
#define err_report(X...) do { fprintf(stderr, X); fprintf(stderr, "\n"); } while (0)
char host_dir_sep_char() { return '/'; }

int wf_load(char *filename, void *ptr, int len)
{
	FILE *f;
	f = fopen(filename, "rb");
	if (f == NULL)
		return -1;
	if (fread(ptr, 1, len, f) != len) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}

int wf_save(char *filename, void *ptr, int len)
{
	FILE *f;
	f = fopen(filename, "wb");
	if (f == NULL)
		return -1;
	if (fwrite(ptr, 1, len, f) != len) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}

/*
 * Veneer on file-handling.
 */

#define SAVE	  0
#define WRITEINFO 1
#define READINFO  5
#define LOAD	  0xff

#define FILEFOUND  1

static int ticks(void)
{ int last;
  last = lastticks;
  lastticks = (clock() * 100) / CLOCKS_PER_SEC; 
  
  return(lastticks-last);
}

static int fileinfo(_kernel_osfile_block *info, const char *name)
{
#ifdef __riscos
  if (_kernel_osfile(READINFO, name, info) != FILEFOUND)
      return -1;
#else
  struct stat buf;
  int len, ftype;

  if (stat(name, &buf) != 0) return -1;

  len = strlen(name);
  if (len > 4 && name[len-4] == ',')
    ftype = (int)strtoul(name + len - 4, NULL, 16);
  else
    ftype = 0xfff;

  info->load = 0xfff00000 | (ftype << 8);
  info->exec = buf.st_mtime * 100 / 256;
  info->start = buf.st_size;
  info->end = 0;
#endif
  return 0;
}

/*
 * Declarations of nibble values for the encoding.
 */

#define NSHORTS 7
#define NLONGS	(14-NSHORTS)
#define MAXTAB	(7*256)

#define ZERO	0
#define LITERAL 1
#define LONG	2
#define SHORT	(LONG+NLONGS)

/*
 * Data structure declarations.
 */

/*
 * The Info structure is really a 3-word thing, but we are keen to save
 * space, so pack together the pointer to the next element in the list
 * and the count of number of occurrences of this value into a single
 * word.  To get as many bits as possible for the count, we constrain
 * all Info structures to be 8-byte aligned (freeing 3 low bits) and
 * take the top 8 bits of the address off.  This will only work if we
 * are in the low 16MBytes of address space, but it leaves us 11 bits
 * for the count, which is nearly always enough.
 *
 * Later on, we use these same records to build hash tables mapping
 * word values -> index in table of common values, for fast encoding.
 * Fortunately, the tables are of size 7*256 < 2**11, so the same
 * packing dodge still works.  I'm afraid this is all a bit tweaky,
 * but making it fast and small is worth the effort.
 *
 * The alternative is to look up each word by binary search, but that
 * would be slower (I think), viz 10 iterations for each table.
 */


#ifdef __riscos

typedef struct Info {
  word nextandcount;
  word value;
} Info;

#define COUNTBITS 11
#define MAXCOUNT  ((1<<COUNTBITS)-1)

#define unpack(p,n,c)	  { word t = p->nextandcount;		      \
			    n = (Info *)((t>>COUNTBITS)<<3);	      \
			    c = t & MAXCOUNT; }

#define pack(p,n,c)	  { word t = (((word)n)<<(COUNTBITS-3)) | c;  \
			    p->nextandcount = t; }

#define inccount(p,n,c)   { if (c < MAXCOUNT) ++c;		      \
			    p->nextandcount = (((word)n)<<(COUNTBITS-3)) | c; }
#else /* !__riscos */

typedef struct Info {
  struct Info *next;
  word count;
  word value;
} Info;

#define MAXCOUNT  INT_MAX

#define unpack(p,n,c)	  { n = p->next;		      \
			    c = p->count; }

#define pack(p,n,c)	  { p->next = n;  \
			    p->count = c; }

#define inccount(p,n,c)	  { p->next = n;  \
			    p->count = ++c; }

#endif /* !__riscos */

typedef Info *HashTab[HASHSIZE];

typedef struct VTable { /* sorted vector of common values */
  int nhits;	    /* sum of frequencies of words in this table */
  Info *misses;     /* list of (freq, word) pairs not in this table */
  int size;	    /* number of els in the table */
  word els[MAXTAB]; /* table els: [0..size-1] are valid */
} VTable;

typedef struct Freqs { /* list of (value, frequency) pairs */
  int nzeros;	    /* no of zero words */
  int maxfreq;	    /* max frequency in list - useful for sorting */
  Info *freqs;	    /* list of (value, frequency) */
} Freqs;

/*
 * Some ugly stuff to allocate 2-word Info structures efficiently with
 * correct (8-byte) alignment.
 */

typedef struct InfoChunk {
  struct InfoChunk *next;
  Info *free;
  Info *limit;
  Info chunks[(CHUNKSIZE-12)/sizeof(Info)];
} InfoChunk;

static Info *align(Info *p, int n)
{ intptr_t x = (intptr_t)p;
  x += (n - 1); x -= (x % n); return (Info *)x;
}

#if DEBUGGING
static char *heaplwm;
static char *heaphwm;
#endif

static void *xalloc(int bytes, char *what)
{ void *p = malloc(bytes);
  if (p == NULL) err_fail("no more room");
/*  if ((int)p < 0) err_fail("storage corruption (%d)", (int)p); */
#if DEBUGGING
  if (debug) fprintf(stderr, "-- alloc(%d, %s) -> &%x\n", bytes, what, (int)p);
  if ((char *)p + bytes > heaphwm) heaphwm = (char *)p + bytes;
  if ((char *)p < heaplwm) heaplwm = (char *)p;
#else
  what = NULL;
#endif
  return(p);
}

static void xfree(void *p)
{
#if DEBUGGING
  if (debug) fprintf(stderr, "-- free(&%x)\n", (int)p);
#endif
  free(p);
}

static InfoChunk *curchunk;

static void freechunks(void)
{ InfoChunk *p, *next;
  for (p = curchunk; p != NULL; p = next) { next = p->next; xfree(p); }
  curchunk = NULL;
}

static Info *newinfo(Info *next, word v)
{ InfoChunk *chunk;
  Info *p;

  chunk = curchunk;
  if ((chunk == NULL) || ((p = chunk->free) >= chunk->limit)) {
    chunk = (InfoChunk *)xalloc(CHUNKSIZE, "InfoChunk");
    chunk->next  = curchunk;
    chunk->free  = p = align(chunk->chunks, 8);
    chunk->limit = (Info *)(((intptr_t)chunk) + CHUNKSIZE - sizeof(Info));
    curchunk = chunk;
  }
  chunk->free = (p + 1);
  pack(p, next, 1); p->value = v;
  return(p);
}

static void countwords(word *code, word *limit, Freqs *ans)
/*
 * Counts number of occurrences of each word value in the specified block
 * of code [code, limit), and returns list of (value, freqency) pairs.
 */
{ HashTab *hash;
  Info **list;
  Info *p, *next, *freqs;
  int j, nzeros, maxfreq;
  word w;

  hash = xalloc(sizeof(HashTab), "HashTab");
  for (j = 0; j < HASHSIZE; ++j) (*hash)[j] = NULL;
  nzeros = 0;
  while (code < limit) {
/*    fprintf(stderr, "code %p limit %p\n", code, limit); */
    w = *code++;
    if (w == 0) { ++nzeros; continue; }
    j = (w + (w >> 11) + (w >> 22)) % HASHSIZE; /* simple hash function */
    list = &((*hash)[j]);
    p = *list;
    while (1) {
	if (p == NULL) { /*fprintf(stderr, "newinfo\n");*/ *list = newinfo(*list, w); /*fprintf(stderr, "newinfo out\n");*/ break; }
      unpack(p, next, j);
      if (w == p->value) { inccount(p, next, j); break; }
      p = next;
    }
  } /* while code < limit */
/*fprintf(stderr, "done countwords\n");*/
  /*
   * Now flatten the hash table into a single list, and free the vector.
   */
  freqs = NULL; maxfreq = 0;
  for (j = 0; j < HASHSIZE; ++j) {
    for (p = (*hash)[j]; p != NULL; p = next) {
      unpack(p, next, w); pack(p, freqs, w); freqs = p;
      if (w > maxfreq) maxfreq = w; /* keep track of max frequency */
    }
  }
  ans->nzeros  = nzeros;
  ans->maxfreq = maxfreq;
  ans->freqs   = freqs;
  xfree(hash);
}

static int comparewords(const void *a, const void *b)
/*
 * This proc is passed to the library qsort for sorting table elements.
 * We know that all table elements are distinct, so can cheat a little.
 */
{ word x = *(word *)a;
  word y = *(word *)b;
  if (x > y) return(+1);
  return(-1);
}

static void maketable(Freqs *freqs, int maxsize, int wantmisses, VTable *tab)
/*
 * Builds a VTable of the most common values in the list freqs,
 * taking at most maxsize of them, destroying the freqs list
 * in the process, and leaving the remnants hung off the VTable
 * record.
 */
{ Info **withfreq = xalloc((freqs->maxfreq+1) * sizeof(Info *), "withfreq");
  Info **list;
  Info *p, *next, *misses;
  int  freq, nhits, size;

  /*
   * It is easy to sort things by frequency, as frequency range is
   * limited to 1..freqs->maxfreq.  So just build a vector of lists.
   */
  for (list = withfreq + freqs->maxfreq; list >= withfreq; *list-- = NULL);

  for (p = freqs->freqs; p != NULL; p = next) { /* put p into bucket */
    unpack(p, next, freq);
    pack(p, (withfreq[freq]), freq);
    withfreq[freq] = p;
  }
  freqs->freqs = NULL;

  nhits  = 0;
  misses = NULL;
  size	 = 0;
  for (list = withfreq + freqs->maxfreq; list >= withfreq; --list) {
    for (p = *list; p != NULL; p = next) {
      unpack(p, next, freq);
      if (size < maxsize) {			   /* add to table */
	tab->els[size++] = p->value; nhits += freq;
      } else {					   /* add to misses list */
	if (!wantmisses) break;
	pack(p, misses, freq); misses = p;
      }
    }
  }
  tab->nhits  = nhits;
  tab->misses = misses;
  tab->size   = size;
  xfree(withfreq);
  qsort((void *)(tab->els), size, sizeof(word), &comparewords);
  if (verbose > 1) printf("-- built table in %d csec\n", ticks());
}

static void masklowbyte(Info *list, Freqs *ans)
/*
 * Take a list of (value, frequency) of 4-byte values, merge values
 * with the same low byte to produce list of 3-byte values.
 */
#define VECBITS 12
#define VECSIZE (1<<VECBITS)
{ Info **vec = xalloc(VECSIZE * sizeof(Info *), "mergevec");
  Info **pp;
  Info *p, *next;
  Info *q, *qnext, *qprev;
  int freq, qfreq, qprevfreq, maxfreq;
  word val, qval;

  for (pp = vec + VECSIZE-1; pp >= vec; *pp-- = NULL);
  for (p = list; p != NULL; p = next) {
    unpack(p, next, freq);
    val = (p->value >> 8); p->value = val;
    pp = vec + ((val + (val >> 9) + (val >> 12)) % VECSIZE);
    /*
     * Now insert p in the ascending-value sorted list pp.
     * This is tricky because of the packing of the nextandcount field,
     * so have to handle start of list specially.
     */
    q = *pp;
    if (q == NULL) { pack(p, NULL, freq); *pp = p; continue; }
    unpack(q, qnext, qfreq); qval = q->value;
    if (val < qval) { pack(p, q, freq); *pp = p; continue; }
    if (val == qval) {
      qfreq += freq; if (qfreq > MAXCOUNT) qfreq = MAXCOUNT;
      pack(q, qnext, qfreq); continue;
    }
    while (1) {
      qprev = q; qprevfreq = qfreq; q = qnext;
      if (q == NULL) {	 /* end of list: add it here */
	pack(p, NULL, freq); pack(qprev, p, qprevfreq); break;
      }
      unpack(q, qnext, qfreq); qval = q->value;
      if (val < qval) {  /* not in list: add it */
	pack(p, q, freq); pack(qprev, p, qprevfreq); break;
      }
      if (val == qval) { /* value matches: add frequency */
	qfreq += freq; if (qfreq > MAXCOUNT) qfreq = MAXCOUNT;
	pack(q, qnext, qfreq); break;
      }
    }
  }
  /*
   * Phew! That should keep the register allocator busy.
   * Now we have a vector of sorted lists: just have to flatten it.
   */
  q = NULL; maxfreq = 0;
  for (pp = vec + VECSIZE-1; pp >= vec; --pp) {
    for (p = *pp; p != NULL; p = next) {
      unpack(p, next, freq); pack(p, q, freq); q = p;
      if (freq > maxfreq) maxfreq = freq;
    }
  }
  ans->maxfreq = maxfreq;
  ans->freqs   = q;
  xfree(vec);
}

#define FASTSIZE 4096
#define FASTHASH(v) ((v + (v >> 7) + (v >> 15)) % FASTSIZE)

static Info **fasttab(VTable *tab)
/*
 * Builds a hash table for translating value -> index in table.
 */
{ Info **vec = xalloc(FASTSIZE * sizeof(Info *), "fasthash");
  Info **pp;
  int idx;
  word val;
  Info *p;

  for (pp = vec + FASTSIZE; pp > vec; *--pp = NULL);

  for (idx = 0; idx < tab->size; ++idx) {
    val = tab->els[idx];
    pp = vec + FASTHASH(val);
    /*
     * Values in table are unique, so just add it to chain.
     */
    p = newinfo(NULL, val); pack(p, *pp, idx); *pp = p;
  }
  return(vec);
}

static int lookup(Info **fast, word val)
{ Info *p, *next;
  int idx;

  for (p = fast[FASTHASH(val)]; p != NULL; p = next) {
    unpack(p, next, idx);
    if (val == p->value) return(idx);
  }
  return(-1);
}

#define TOPBYTE(n) ((n)>>24)
#define LOWBYTE(n) ((n)&0xff)
#define PUTLOWBYTE(p, n) (*p++ = (n)) /* relies on store masking low byte */

#define ENCODEVALUE(w, nibble, p) \
    if (w == 0) {					      \
      nibble = ZERO;					      \
    } else if ((idx = lookup(fshorts, w)) >= 0) {	      \
      PUTLOWBYTE(p, idx);				      \
      nibble = SHORT + (idx >> 8);			      \
    } else if ((idx = lookup(flongs, w>>8)) >= 0) {	      \
      PUTLOWBYTE(p, w); PUTLOWBYTE(p, idx);		      \
      nibble = LONG + (idx >> 8);			      \
    } else {						      \
      *p++ = TOPBYTE(w); w <<= 8; *p++ = TOPBYTE(w); w <<= 8; \
      *p++ = TOPBYTE(w); w <<= 8; *p++ = TOPBYTE(w);	      \
      nibble = LITERAL; 				      \
    }

#define ENCSIZE 8192

/*
 * We encode a pair of words at a time.  To avoid unnecessary copying of data,
 * things are done in a rather curious order, not quite the opposite of the
 * optimal decoding order.  I can't quite convince myself that this is optimal,
 * but I think it is quite good.
 */

static char *encode(word *base, word *limit, Info **fshorts, Info **flongs,
	     SqueezeHeader *h)
/*
 * Returns pointer to byte after the encoded data.
 */
{ word *code;
  word w;
  int idx, nib0, nib1;
  char *buf, *p;

  buf = xalloc(ENCSIZE, "encodebuf"); p = buf;

  h->decodedsize = ((char *)limit - (char *)base);
  for (code = base; code < limit; code += 2) {
    w = code[1]; ENCODEVALUE(w, nib1, p);
    w = code[0]; ENCODEVALUE(w, nib0, p);
    *p++ = (nib0 | (nib1 << 4));
    if (buf != NULL) {
      idx = ((intptr_t)code - (intptr_t)base - 12 - (p - buf));
      if (idx > 0) { h->bytestomove = (p - buf); }
      if ((idx > 256) || (p - buf > ENCSIZE - 16)) {
	/*
	 * Swap from encoding into buf to encoding on top of old stuff
	 * once we get 256 bytes clear, or run out of buf space.
	 */
	memcpy(base+1, buf, p-buf); p = (p-buf) + (char *)(base+1);
	xfree(buf); buf = NULL;
      }
    } else {
      if (p >= (char *)(code-2))
	err_fail("pathological file - can't cope");
    }
  }
  h->encodedsize = p - (char *)(base+1);
  return(p);
}

static char *encodetable(VTable *tab, int threebytes, char *out)
/*
 * Encode the table of 3 or 4 byte values, starting at address p,
 * return pointer to first free byte after table.
 */
{ word *p, *limit;
  word prev, w;
  int delta, ones;

  ones = 0; prev = (word)(-1);
  p = tab->els; limit = p + tab->size;
  while (p < limit) {
    w = *p++; delta = (w - prev); prev = w;
    if (delta == 1) ++ones;
    if ((ones > 0) && ((delta != 1) || (ones >= 9))) {
      *out++ = ones; ones = 0;
    }
    if (delta < 2) {  /* dealt with above: no zeros, ones are peepholed */ }
    else if (delta <= 91-10) { *out++ = delta+10; }
    else if (delta < 82*256) {
      *out++ = (delta>>8)+92; *out++ = LOWBYTE(delta);
    }
    else if (delta < 82*256*256) {
      *out++ = (delta>>16)+174;
      *out++ = LOWBYTE(delta); delta >>= 8;
      *out++ = LOWBYTE(delta);
    }
    else {
      *out++ = 0;
      *out++ = LOWBYTE(delta); delta >>= 8;
      *out++ = LOWBYTE(delta); delta >>= 8;
      *out++ = LOWBYTE(delta);
      if (!threebytes) { delta >>= 8; *out++ = delta; }
    }
  } /* end loop over values in table */
  if (ones > 0) *out++ = ones;
  return(out);
}

static char *writeunsqueeze(char *out, int execoffset)
{ intptr_t base, limit;
  word *p;
  int n, rotr, op;

  /* UnSqueeze_FindUnSqueezeCode(&base, &limit); */
  base = (intptr_t)&UnSqueeze_UnSqueezeBase[6*4];
  limit = (intptr_t)&UnSqueeze_UnSqueezeLimit[0];
  n = limit - base;
  memcpy(out, (void *)base, n); out += n;
  /*
   * Now construct ARM code to jump to exec address, starting with
   * load address in R8.
   */
  op = ADD_R8_R8_0;
  if (execoffset < 0) { op = SUB_R8_R8_0; execoffset = -execoffset; }
  rotr = 32; p = (word *)out;
  while (execoffset > 0) {
    /* Generate ADD/SUB R8, R8, #thing */
    n = LOWBYTE(execoffset); execoffset >>= 8;
    if (n != 0) {
      *p++ = op + (0x100 * ((rotr % 32) / 2)) + n;
    }
    rotr -= 8;
  }
  *p++ = SUB_R1_R8_IMM4;
  *p++ = SWI_XOS_SynchroniseCodeAreas;
  *p++ = MOV_PC_R8;
  return((char *)p);
}

static char *compresscode(word *code, intptr_t size, int execoffset)
/*
 * Returns NULL if no compression, else pointer to top of compressed thing.
 */
{ Freqs  freqs;
  word	 *limit;
  VTable *shorts, *longs;
  Info **fshorts, **flongs;
  int	 nwords, guess, nliterals;
  SqueezeHeader header;
  char *pos, *tabstart;

  size += 7; size &= ~7; /* round up to an even number of words */
  limit = (word *)((char *)code + size);
  countwords(code, limit, &freqs);
  if (verbose > 1) printf("-- counted %" PRIdPTR " bytes in %d csec\n", size, ticks());
  /*
   * Allocate the VTables here to avoid nasty storage interactions, which
   * can lose us some chunks if we're not careful.
   */
  shorts = xalloc(sizeof(VTable), "shorts");
  longs  = xalloc(sizeof(VTable), "longs");
  maketable(&freqs, NSHORTS*256, 1, shorts);
  masklowbyte(shorts->misses, &freqs);
  if (verbose > 1) printf("-- masklowbyte took %d csec\n", ticks());
  maketable(&freqs, NLONGS*256, 0, longs);
  freechunks();
  /*
   * Now guess what the size of the compressed thing would be.
   * The estimates of size of encoded data are exact, but the
   * estimates of encoded table size are a guess, so we over-estimate
   * the size of the decompression code to be on the safe side.
   */
  nwords    = (size / sizeof(word));
  nliterals = nwords - freqs.nzeros - shorts->nhits - longs->nhits;

  guess     =	(nwords / 2)	       /* 0.5 bytes per word of original */
	      + (1 * shorts->nhits)    /* 1 more byte for each short */
	      + (2 * longs->nhits)     /* 2 for each long */
	      + (4 * nliterals)        /* 4 for each literal */
	      + (2 * shorts->size)     /* rough size of shorts table */
	      + (2 * longs->size)      /* rough size of longs table */
	      + 1024;		       /* decompression code + safety margin */

  if (verbose)
	  fprintf(stderr, "-- encoding stats (0,1,2,4) %d%% %d%% %d%% %d%%\n",
	  (freqs.nzeros  * 100) / nwords,
	  (shorts->nhits * 100) / nwords,
	  (longs->nhits  * 100) / nwords,
	  (nliterals	 * 100) / nwords);

  if (guess > (9*size)/10) { /* not much squeeze to be had */
    if ((!force) || (guess > size)) return(NULL);
  }

  /*
   * Now can actually start encoding stuff.
   */
  fshorts = fasttab(shorts);
  flongs  = fasttab(longs);
  if (verbose > 1) fprintf(stderr, "-- built speedup tables in %d csec\n", ticks());
  pos = encode(code, limit, fshorts, flongs, &header);
  xfree(flongs);
  xfree(fshorts);
  freechunks();
  if (verbose > 1)
    fprintf(stderr, "-- encode gives %d in %d csec\n", header.encodedsize, ticks());
  tabstart = pos;
  pos = encodetable(shorts, 0, pos);
  pos = encodetable(longs,  1, pos);
  header.nshorts = shorts->size;
  xfree(shorts);
  header.nlongs  = longs->size;
  xfree(longs);
  /* now word-align before the header words */
  while (((intptr_t)pos & 3) != 0) *pos++ = 0;
  header.encodedtabs = (pos - tabstart);
  if (verbose > 1)
    fprintf(stderr, "-- decode tables occupy %d bytes\n", header.encodedtabs);
  memcpy(pos, &header, sizeof(SqueezeHeader)); pos += sizeof(SqueezeHeader);
  /*
   * Now the branch instruction to be put at the start: this has a word
   * offset, with suitable allowance for ARM prefetch.
   * In fact we want to skip the first 3 instructions of decompression
   * code, as these are executed only for an AIF image.
   */
  *code = B + ((word *)pos - code) + AIFPRELUDE - PREFETCH;
  pos = writeunsqueeze(pos, execoffset);
  pos += sprintf(pos, "rcc %s\n", Module_MajorVersion);
  /* Pad size to be 15 mod 16 */
  while ((((intptr_t)pos - (intptr_t)code) & 0xf) != 0xf) *pos++ = ' ';
  return(pos);
}

static char *compress(word *code, int size, int execoffset)
/*
 * This handles the AIF-specific stuff.
 * Returns NULL if no compression, else pointer to top of compressed thing.
 */
{ char *top;

  if (code[0] != BL+NV && code [0] != MOV_R0_R0)
  { /* not BLNV $+0, not NOOP => not an AIF image */
    return compresscode(code, size, execoffset);
  }
  top = compresscode(code+AIFWORDS, size-AIFBYTES, execoffset-AIFBYTES+4);
  /*
   * Now first word of the stuff we have just compressed is
   * B UnsqueezeADFSImage.  We want the first word of the header to
   * be BL UnsqueezeAIFImage, i.e. destination AIFPRELUDE words earlier.
   */
  code[0] = BL + (code[32] & 0x00ffffff) + AIFWORDS - AIFPRELUDE;
  return(top);
}

#ifdef __riscos
static void arthurise(_kernel_osfile_block *info, int type)
{ int data[2];
  if ((info->load == info->exec) && (info->load == 0x8000)) {
    /* can we use Arthur 'FF8' filetype ? */
      if (_kernel_hostos() == _kernel_ARTHUR) {
      /* This is Arthur - get time of day */
      if (verbose > 1) fprintf(stderr, "-- getting timestamp from Arthur\n");
      data[0] = 3;
      (void) _kernel_osword(14, data);
      info->exec = data[0];
      info->load = 0xfffff800 + (data[1] & 0xff);
      if (type)
          info->load = type + (data[1] & 0xff);
    }
  }
}
#endif

static int squeeze(char *in, char *out)
{ _kernel_osfile_block info;
  int rc, t, squeezed, isdata;
  intptr_t size;
  void *load;
  datahdr *d;
  word *code;
  char *top, *p;
  int type = 0;

  if (verbose) fprintf(stderr, "-- squeezing '%s' to '%s'\n", in, out);
  squeezed = 0; isdata = 0;
  if (fileinfo(&info, in) == -1) err_fail("'%s' does not exist", in);
  size = info.start;
  if ((!force) && (strcmp(in, out) == 0)) {
    /* Check quickly to see if worth loading */
    if (size < 2048) {
      err_report("'%s' is too small to squeeze", in);
      return(0);
    }
    if ((size & 0xf) == 0xf) {
      err_report("'%s' is already squeezed", in);
      return(0);
    }
  }
  if ((info.load & 0xffffff00) == 0xfffffa00) { /* Module */
      int header_size;

      header_size = (intptr_t)&unsqueeze_limit[0] - (intptr_t)&unsqueeze_base[0];
      unsqueeze_end = (void *)size;
      size += header_size;
      code = xalloc(size+DATABYTES+8, "code");
      d = (datahdr *)code; code += DATAWORDS;
      top = ((char *)code) + size+8;
      for (p = top-8; p < top; *p++ = 0); /* Clear the padding space */
      memcpy(code, &unsqueeze_base, header_size);
      load = code + header_size;
  } else {
      code = xalloc(size+DATABYTES+8, "code"); /* Pad to even no of words */
      d = (datahdr *)code; code += DATAWORDS;  /* Space for data header */
      top = ((char *)code) + size+8;
      for (p = top-8; p < top; *p++ = 0); /* Clear the padding space */
      load = code;
  }
  if (wf_load(in, load, info.start) == -1)
      err_fail("can't load '%s'", in);
  if ((info.load & 0xfff00000) == 0xfff00000) {
      type = (info.load & ~0xff);
      if (type == 0xfffffa00)
          type = 0xfffff800;
      info.load = 0x8000;
      info.exec = 0x8000;
  } else {
      if (info.load & 0xfc000000) {
          isdata = 1; d->datamagic = DATAMAGIC;
          d->load = info.load; d->exec = info.exec; d->size = size;
      }
  }
  if (verbose > 1) fprintf(stderr, "-- loaded %" PRIdPTR " bytes in %d csec\n", size, ticks());
  t = clock();

  if (isdata) top = compresscode(code, size, (-DATABYTES) + 4);
  else top = compress(code, size, info.exec - info.load);

  if (top != NULL) {
    t = ((clock() * 100) / CLOCKS_PER_SEC) - t;
    if (isdata) {
      d->bl_decompress = code[0] + DATAWORDS; /* dirty... */
      code -= DATAWORDS;
      info.load = SQUEEZED | (info.load & 0xff);
    } else
      info.exec = info.load;
    rc = (top - (char *)code);
    if (verbose) {
      fprintf(stderr, "-- compressed size %d is %" PRIdPTR "%% of %" PRIdPTR "\n", rc, (rc*100)/size, size);
      fprintf(stderr, "-- compression took %d csec, %" PRIdPTR " bytes/cpusec\n", t, (size*100)/(t?t:1));
    }
    squeezed = 1;
  } else {
    top = (char *)code + size;
    if (verbose) err_report("can't compress '%s', will copy", in);
  }
  if (squeezed || (strcmp(in, out) != 0)) { /* Write it out */
#ifdef __riscos
    arthurise(&info, type);
#endif
    if (wf_save(out, code, top - (char *)code) == -1)
	err_fail("failed to write '%s'", out);
#ifdef __riscos
    _swix(OS_File, _INR(0,2), 18, out, (info.load << 12) >> 20);
#endif
  }
  xfree(d);
#if DEBUGGING
  if (debug) {
    printf("-- heaphwm = &%x = &8000+%d\n",(int)heaphwm,(int)heaphwm-0x8000);
    printf("-- heaplwm = &%x, range = %d\n", (int)heaplwm, heaphwm - heaplwm);
  }
#endif
  return(0);
}

static void help(void)
{ char ch = host_dir_sep_char();
  fprintf(stderr, "\n%s vsn %s [%s] - %s\n", SELF, VSN, DATE, BRIEF);
  fprintf(stderr, "\n%s [options] unsqueezed-file [squeezed-file]\n", SELF);
  fprintf(stderr, "\nOptions:-\n");
  fprintf(stderr, "-f   try harder to squeeze unpleasant things\n");
  fprintf(stderr, "-v   output messages and compression statistics\n");
  fprintf(stderr, "\nExamples:-\n");
  fprintf(stderr, "     %s myprog      %s -v myprog squozen%cmyprog\n", SELF, SELF, ch);
  exit(EXIT_SUCCESS);
}

static void handle_escape(int signo)
{
  signal(signo, handle_escape);
  exit(EXIT_FAILURE);
}


#define err_init(X)
#define host_init()
#define hash_cistrcmp strcmp

static void initialise(void)
{
  signal(SIGINT, handle_escape);
  host_init();
  err_init(SELF);
  debug = 0; force = 0; verbose = 0;
  curchunk = NULL;
  ticks();
}

int main(int argc, char *argv[])
{ int j;
  char *arg;
  char *a = NULL;
  char *b = NULL;

  initialise();

  /* parse help or identify args */
  for (j = 1;  j < argc;  ++j) {
    arg = argv[j];
    if (hash_cistrcmp("-help", arg) == 0 || hash_cistrcmp("-h", arg) == 0) {
      help();
    }
  }

  /* parse args */
  for (j = 1; j < argc; ++j) {
    arg = argv[j];
    if (arg[0] == '-') {
      int i = 1;
      while (arg[i]) {
	switch (arg[i]) {
	  case 'f':
	  case 'F': ++force; break;
	  case 'v':
	  case 'V': ++verbose; break;
#if DEBUGGING
	  case 'z':
	  case 'Z': ++debug; break;
#endif
          case 'o':
              ++j;
              if (argv[j])
                  b = argv[j];
              else
                  err_fail("no filename on -o flag\n");
              break;
	  default : err_fail("flag '%c' not recognised", arg[i]);
	}
	++i;
      }
    } else { /* a filename */
      if      (a == NULL) a = arg;
      else if (b == NULL) b = arg;
      else err_fail("too many filename arguments '%s'\n", arg);
    }
  }
  if (a == NULL) err_fail("missing filename");
  if (b == NULL) b = a; /* squeeze it to itself */

#if DEBUGGING
  if (debug) { heaplwm = (char *)0x03ffffff; heaphwm = NULL; }
#endif

  return(squeeze(a, b));
}