/*
 *	Extract ASCII Text from M$ Word Document
 *
 *	(c) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 */

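/* FIXME: endianness dependencies! The FIB is fread() straight into a struct,
   so its multi-byte fields are only decoded correctly on little-endian hosts. */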

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Input and output streams shared by every routine in this file. */
FILE *fi;
FILE *fo;

/* Shorthand integer types used when decoding the file format. */
typedef unsigned char byte;		/* one octet */
typedef unsigned short word;	/* assumed to be 16 bits wide */
typedef unsigned int ulg;		/* assumed to be 32 bits wide */

byte
gb(void)
{
  return fgetc(fi);
}

word
gw(void)
{
  word k = fgetc(fi);
  k = k | (fgetc(fi) << 8);
  return k;
}

/*
 * Read a 32-bit little-endian value from the input stream as two
 * consecutive 16-bit words (low word first).
 *
 * Both halves are widened to ulg before combining: shifting the promoted
 * (signed int) result of gw() left by 16 would overflow into the sign bit
 * whenever the high word is >= 0x8000.
 */
ulg
gl(void)
{
  ulg lo = gw();
  ulg hi = gw();

  return lo | (hi << 16);
}

void
ole(void)
{
  ulg p, word;
  byte oh[0x80], pn[0x80];
  int z, x, y;

  fseek(fi, 0x30, SEEK_SET);
  p = 0x200 * gl() + 0x200;				/* Position of central directory */
  word = 0;
  printf("Central OLE directory at 0x%x\n", p);
  fseek(fi, p, SEEK_SET);
  while (fread(oh, sizeof(oh), 1, fi) == 1)
	{
	  z = y = 0;
	  if ((!oh[0] && !oh[1]) || (!oh[2] && !oh[3]))
		break;
	  if (oh[z] < 32 && !oh[z+1])
		{
		  z += 2;
		  pn[y++] = '>';
		  pn[y++] = ' ';
		}
	  while (oh[z] || oh[z+1])
		{
		  x = oh[z] | (oh[z+1] << 8);
		  if (x >= 32 && x < 127)
			pn[y++] = x;
		  else
			pn[y++] = '?';
		  z += 2;
		}
	  pn[y] = 0;
	  p = 0x200 * (oh[0x74] | (oh[0x75] << 8) | (oh[0x76] << 16) | (oh[0x77] << 24)) + 0x200;
	  printf("%08x %s\n", p, pn);
	  if (!strcmp(pn, "WordDocument"))
		word = p;
	}
  if (!word)
	{
	  fprintf(stderr, "Word doc stream not found!\n");
	  exit(1);
	}
  fseek(fi, word, SEEK_SET);
}

/*
 * File header of a Word document.  unword() fills this structure with a
 * single fread(), so the member order and sizes must mirror the bytes in
 * the file exactly; the packed attribute keeps the compiler from inserting
 * padding (e.g. before cryptkey, which sits at an offset that is not a
 * multiple of 4).
 *
 * Most members come in pairs named <x>p / <x>l -- presumably a position
 * and a length (compp/compl are printed that way by unword()); only the
 * members commented below are actually interpreted by this program.
 */
struct fib {
  word magic;							/* unword() accepts 0xa5db and 0xa5dc */
  word vers;
  word product;
  word lang;							/* printed by unword() */
  word pnnext;
  byte flags1;							/* bit 0x04: complex format (tested in unword()) */
  byte flags2;							/* bit 0x01: encrypted file (tested in unword()) */
  word back;
  ulg cryptkey;
  byte environ;
  byte rfu;
  word textcharset;						/* 0=win, 256=mac */
  word tablecharset;					/* 0=win, 256=mac */
  ulg firsttextchar;					/* offset of the text, relative to the start of this header */
  ulg xx[6];
  ulg textlen;							/* number of text bytes unword() extracts */
  ulg footlen;
  ulg hdrlen;
  ulg macrolen;
  ulg annolen;
  ulg endnotelen;
  ulg textboxlen;
  ulg htextboxlen;
  ulg rfu2;
  ulg stshorigp, stshorigl;
  ulg stshp, stshl;
  ulg footrefp, footrefl;
  ulg foottxtp, foottxtl;
  ulg annorefp, annorefl;
  ulg annotxtp, annotxtl;
  ulg sedp, sedl;
  ulg pardp, pardl;
  ulg pahp, pahl;
  ulg glossp, glossl;
  ulg glosp, glosl;
  ulg hdrp, hdrl;
  ulg chpbp, chpbl;
  ulg papbp, papbl;
  ulg seap, seal;
  ulg ffnsp, ffnsl;
  ulg mainfpp, mainfpl;
  ulg headfpp, headfpl;
  ulg footfpp, footfpl;
  ulg annofpp, annofpl;
  ulg macfpp, macfpl;
  ulg boosp, boosl;
  ulg bookop, bookol;
  ulg booklp, bookll;
  ulg cmdsp, cmdsl;
  ulg mcrpp, mcrpl;
  ulg mcrsp, mcrsl;
  ulg pdrvp, pdrvl;
  ulg prenvpp, prenvpl;
  ulg prenvlp, prenvll;
  ulg wssp, wssl;
  ulg dopp, dopl;
  ulg assosp, assosl;
  ulg compp, compl;						/* Complex file info! */
  ulg footpgp, footpgl;
  ulg orignamep, orignamel;
  ulg annoownp, annoownl;
  ulg annobnp, annobnl;
} __attribute__((packed));

/*
 * txl: number of text bytes cc() may still hand out.
 * lbf: number of over-long lines that had to be flushed without finding
 *      a space to break at.
 */
ulg txl, lbf;

/*
 * Return the next text byte (0..255), or -1 once the txl budget has been
 * used up.
 */
int
cc(void)
{
  if (!txl)
	return -1;
  txl--;
  return gb();
}

/* Pending output line and the number of characters currently stored in it. */
char lb[86];
int lbi;

/*
 * Terminate the pending line with a newline, write it to the output
 * stream and start over with an empty line.
 */
void
flb(void)
{
  char *end = lb + lbi;

  *end++ = '\n';
  *end = 0;
  lbi = 0;
  fputs(lb, fo);
}

void
pc(int c)
{
  if (lbi >= 80)
	{
	  int lc = lbi;
	  while (lc > 0 && lb[--lc] != ' ')
		;
	  if (!lc)
		{
		  lbf++;
		  flb();
		}
	  else
		{
		  char exb[80];
		  lb[lbi] = 0;
		  lb[lc] = 0;
		  lbi = lc++;
		  strcpy(exb, lb+lc);
		  flb();
		  strcpy(lb, exb);
		  lbi = strlen(lb);
		}
	}
  lb[lbi++] = c;
}

/*
 * Copy the document text to the output, translating control codes.
 *
 * Characters are pulled from cc() until it returns -1.  Handling by code:
 *   12             flush the line, then emit a form feed
 *   13             add an extra newline and flush (leaves a blank line)
 *   11             flush the line
 *   9, 14          emitted as a tab
 *   31             emitted as '-'
 *   7, 19, 20, 21  dropped
 *   160            emitted as '~'
 *   anything else  passed through pc() unchanged
 *
 * At the end the pending line is flushed and, if any line could not be
 * wrapped at a space, the number of such lines is reported on stdout.
 */
void
text(void)
{
  int c;

  for(;;)
	switch (c = cc())
	  {
	  case -1:
		flb();
		if (lbf)
		  printf("%u line breaks failed\n", lbf);	/* lbf is unsigned */
		return;
	  case 12:
		flb();
		fputc(12, fo);
		break;
	  case 13:
		lb[lbi++] = '\n';
		/* FALL-THRU */
	  case 11:
		flb();
		break;
	  case 9:
	  case 14:
		pc(9);
		break;
	  case 31:
		pc('-');
		break;
	  case 7:
	  case 19:
	  case 20:
	  case 21:
		break;
	  case 160:
		pc('~');
		break;
	  default:
		pc(c);
	  }
}

/*
 * Read the Word file header (FIB) at the current position of the input
 * stream, validate it, and extract the main document text via text().
 *
 * Offsets stored in the header are relative to the start of the header,
 * so the stream position on entry is added to them before seeking.
 *
 * Exits with status 1 when the header cannot be read, the magic number is
 * unknown, the file is encrypted, or it is a complex (fast-saved) file
 * with the newer magic.
 */
void
unword(void)
{
  struct fib fib;
  ulg where = ftell(fi);

  printf("%d %u\n", (int) offsetof(struct fib, compp), where);
  printf("Reading %zu bytes of file header\n", sizeof(fib));
  if (fread(&fib, sizeof(fib), 1, fi) != 1)
	{
	  fprintf(stderr, "FIB read error!\n");
	  exit(1);
	}
  if (fib.magic != 0xa5db && fib.magic != 0xa5dc)
	{
	  fprintf(stderr, "Black magic!\n");
	  exit(1);
	}
  printf("Lang=%d, charset=[%d,%d]\n", fib.lang, fib.textcharset, fib.tablecharset);
  if (fib.flags1 & 4)
	{
	  if (fib.magic == 0xa5db)
		puts("Complex format, old magic");
	  else
		{
		  printf("Complex format, abs start=0x%x, len=%u\n", fib.compp + where, fib.compl);
		  fprintf(stderr, "Fast-saved format not supported yet!\n");
		  exit(1);
		}
	}
  if (fib.flags2 & 1)
	{
	  fprintf(stderr, "Encrypted files not supported yet!\n");
	  exit(1);
	}
  /* flags2 is the high byte of the 16-bit flag word, so bit 12 of that
	 word (0x1000) is bit 4 here; testing a byte against 0x1000 could
	 never succeed. */
  if (fib.flags2 & 0x10)
	puts("Extended charsets detected");
  printf("First text char at 0x%x, len=%u\n", fib.firsttextchar + where, fib.textlen);
  fseek(fi, fib.firsttextchar + where, SEEK_SET);
  txl = fib.textlen;
  text();
}

/*
 * Identify the input by its first 16-bit word and hand it to unword().
 *
 * A Word magic number means a bare document: the stream is rewound to
 * offset 0.  0xcfd0 means an OLE container: ole() positions the stream on
 * the embedded document.  Anything else is rejected with exit status 1.
 */
void
convert(void)
{
  word id = gw();

  switch (id)
	{
	case 0xa5db:
	case 0xa5dc:
	  fseek(fi, 0, SEEK_SET);
	  puts("Plain M$-Word file...");
	  break;
	case 0xcfd0:
	  puts("OLE file...");
	  ole();
	  break;
	default:
	  fprintf(stderr, "Unknown file format!\n");
	  exit(1);
	}
  unword();
}

/*
 * Entry point: unword <from> <to>.
 *
 * Opens the input document and the output text file, runs the conversion
 * and returns 0 on success.  Returns 1 on a usage error, when either file
 * cannot be opened, or when closing the output reports a write error.
 * (Format errors detected deeper down terminate the program via exit(1).)
 */
int
main(int argc, char **argv)
{
  if (argc != 3)
	{
	  fprintf(stderr, "Usage: unword <from> <to>\n");
	  return 1;
	}
  /* The input is binary data: "rb" keeps platforms that distinguish text
	 mode from translating or truncating it. */
  if (!(fi = fopen(argv[1], "rb")))
	{
	  /* perror() prints "<text>: <errno message>", same as the glibc-only
		 %m conversion but portable. */
	  perror("Unable to open input file");
	  return 1;
	}
  if (!(fo = fopen(argv[2], "w")))
	{
	  perror("Unable to open output file");
	  fclose(fi);
	  return 1;
	}
  convert();
  fclose(fi);
  /* Buffered output is only known to be written once fclose() succeeds. */
  if (fclose(fo))
	{
	  perror("Error writing output file");
	  return 1;
	}
  return 0;
}
