2 * Extract ASCII Text from M$ Word Document
4 * (c) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
7 /* FIXME: endianness dependencies! */
15 typedef unsigned char byte; /* alias for unsigned char; used for the raw buffers filled by fread() */
16 typedef unsigned short word; /* alias for unsigned short; used for fields of the packed struct read from the file -- presumably expected to be 16 bits wide, TODO confirm */
17 typedef unsigned int ulg; /* alias for unsigned int; used for the position/length field pairs of the packed struct and for file offsets -- presumably expected to be 32 bits wide, TODO confirm */
29 k = k | (fgetc(fi) << 8);
45 byte oh[0x80], pn[0x80];
48 fseek(fi, 0x30, SEEK_SET);
49 p = 0x200 * gl() + 0x200; /* Position of central directory */
51 printf("Central OLE directory at 0x%x\n", p);
52 fseek(fi, p, SEEK_SET);
53 while (fread(oh, sizeof(oh), 1, fi) == 1)
56 if ((!oh[0] && !oh[1]) || (!oh[2] && !oh[3]))
58 if (oh[z] < 32 && !oh[z+1])
64 while (oh[z] || oh[z+1])
66 x = oh[z] | (oh[z+1] << 8);
67 if (x >= 32 && x < 127)
74 p = 0x200 * (oh[0x74] | (oh[0x75] << 8) | (oh[0x76] << 16) | (oh[0x77] << 24)) + 0x200;
75 printf("%08x %s\n", p, pn);
76 if (!strcmp(pn, "WordDocument"))
81 fprintf(stderr, "Word doc stream not found!\n");
84 fseek(fi, word, SEEK_SET);
99 word textcharset; /* 0=win, 256=mac */
100 word tablecharset; /* 0=win, 256=mac */
112 ulg stshorigp, stshorigl;
114 ulg footrefp, footrefl;
115 ulg foottxtp, foottxtl;
116 ulg annorefp, annorefl;
117 ulg annotxtp, annotxtl;
128 ulg mainfpp, mainfpl;
129 ulg headfpp, headfpl;
130 ulg footfpp, footfpl;
131 ulg annofpp, annofpl;
140 ulg prenvpp, prenvpl;
141 ulg prenvlp, prenvll;
145 ulg compp, compl; /* Complex file info! */
146 ulg footpgp, footpgl;
147 ulg orignamep, orignamel;
148 ulg annoownp, annoownl;
149 ulg annobnp, annobnl;
150 } __attribute__((packed));
183 while (lc > 0 && lb[--lc] != ' ')
216 printf("%d line breaks failed\n", lbf);
252 ulg where = ftell(fi);
254 printf("%d %d\n", (int)&(((struct fib *)0)->compp), where);
255 printf("Reading %d bytes of file header\n", sizeof(fib));
256 if (fread(&fib, sizeof(fib), 1, fi) != 1)
258 fprintf(stderr, "FIB read error!\n");
261 if (fib.magic != 0xa5db && fib.magic != 0xa5dc)
263 fprintf(stderr, "Black magic!\n");
266 printf("Lang=%d, charset=[%d,%d]\n", fib.lang, fib.textcharset, fib.tablecharset);
269 if (fib.magic == 0xa5db)
270 puts("Complex format, old magic");
273 printf("Complex format, abs start=0x%x, len=%d\n", fib.compp + where, fib.compl);
274 fprintf(stderr, "Fast-saved format not supported yet!\n");
280 fprintf(stderr, "Encrypted files not supported yet!\n");
283 if (fib.flags2 & 0x1000)
284 puts("Extended charsets detected");
285 printf("First text char at 0x%x, len=%d\n", fib.firsttextchar + where, fib.textlen);
286 fseek(fi, fib.firsttextchar + where, SEEK_SET);
295 if (id == 0xa5db || id == 0xa5dc)
297 fseek(fi, 0, SEEK_SET);
298 puts("Plain M$-Word file...");
300 else if (id == 0xcfd0)
307 fprintf(stderr, "Unknown file format!\n");
314 main(int argc, char **argv)
318 fprintf(stderr, "Usage: unword <from> <to>\n");
321 if (!(fi = fopen(argv[1], "r")))
323 fprintf(stderr, "Unable to open input file: %m\n");
326 if (!(fo = fopen(argv[2], "w")))
328 fprintf(stderr, "Unable to open output file: %m\n");