Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

MS-Word reader for UNIX and DOS

17 views
Skip to first unread message

Victor B. Wagner

unread,
Dec 10, 1996, 3:00:00 AM12/10/96
to

Hi, All!

Are you bothered by users who bring you, or send you by e-mail, MS-Word for Windows
files, supposing that everyone in the world has this monster?

Here is a small program which reads Word Doc file and prints its content
to stdout.

It does little more than the standard strings command: it has the size of the Word
header hard-coded (0xD00; correct me if I'm wrong), and tries to recognize the
end of the text correctly (it displays headers and footers in a somewhat funny way
after the text, but it is easier to delete them by hand),

tries to interpret Word tables by inserting tabs and EOLs where necessary,
wraps long lines and inserts a blank line at the end of each paragraph.

Also (and this was the main reason for writing it), it handles the MS-Windows Cyrillic
code page and converts it to KOI-8 when built under Unix, or to GOST-Alternate
when built under DOS. If you want to use a different code page,
feel free to change the windows_table recode array, or the recode() function for
the DOS version.
(It would be better to have a separate array for each pair of input/output code
pages, but I'm too lazy to write them.)

This program is absolutely free and all rights for modification and
redistribution granted.

I also hope that it doesn't violate any laws, because I'm not an owner
of MS-Word and haven't signed any non-disclosure agreements.

Microsoft ought to educate its users not to bring their Word files
to somebody's Unix workstation, if they want to avoid such things.

---------- begin catdoc.c ----------------------------------------------

#include<stdio.h>
#include<string.h>
#ifdef unix
/*
 * Windows to KOI-8 recoding table, indexed by the input byte value.
 * Note that '\r' (0x0D) is recoded into '\n' (0x0A).
 *
 * Every entry is written as a numeric byte value.  In particular the last
 * six entries (input bytes 0xFA..0xFF) are spelled 0xDF,0xD9,0xD8,0xDC,0xC0,
 * 0xD1 instead of as character literals, so the table does not depend on the
 * encoding the source file happens to be stored in.  They are the lower-case
 * counterparts of the last six entries of the 0xD0 row (value - 0x20).
 */
unsigned char windows_table[256]={
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0A,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B,
0x98,0x8C,0x9C,0x89,0xBF,0x9A,0x9D,0x8A,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2,
0xA0,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0xA8,0xAB,0xAC,0xAD,0xAE,
0xAF,0xB2,0xB1,0xB4,0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0x9B,
0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,
0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,
0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,
0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1};
/* Look up the recoded byte; the index is narrowed to 0..255 so the lookup
   can never leave the table. */
#define recode(x) (windows_table[(unsigned char)(x)])
#else
/*
 * Windows to DOS cyrillic recode function.
 *
 * x is one input byte value (0..255); the recoded byte value is returned.
 *   - 0xF0..0xFF are shifted down by 16 and 0xC0..0xEF by 64, which moves
 *     the two halves of the Windows cyrillic alphabet into their DOS slots.
 *   - 0xA8 / 0xB8 (capital / small letter IO in Windows-1251) are mapped to
 *     0xF0 / 0xF1, their positions in the DOS (CP866) code page; without
 *     this they would pass through unchanged and show up as other letters.
 *   - '\r' is turned into '\n'.
 * Every other value is returned unchanged.
 */
int recode(int x)
{
    if (x >= 240)
        return x - 16;
    if (x >= 192)
        return x - 64;
    if (x == 0xA8)
        return 0xF0;
    if (x == 0xB8)
        return 0xF1;
    if (x == '\r')
        return '\n';
    return x;
}

#endif
/*
 * Function to break lines and handle tables.
 *
 * Takes one already-recoded character at a time and writes wrapped text to
 * stdout.  State is kept in static variables between calls:
 *
 *   - Character 7 is treated as a table mark: a single one becomes a tab,
 *     a second one immediately after it becomes a newline.
 *   - '\n' ends the paragraph: pending text is printed followed by a blank
 *     line; if nothing is pending a lone newline is printed.
 *   - A space directly after a buffered space is dropped.
 *   - Once more than WRAP_COLUMN characters are pending, the line is broken
 *     at the last space or tab; the text after it stays pending.
 *   - If no space or tab is available and the buffer is full, the pending
 *     text is printed as a line of its own.  (Previously the buffer index
 *     kept growing and wrote past the end of the buffer.)
 *
 * Text still pending when the caller stops is only written out by a
 * subsequent '\n'.
 */
enum { FORMAT_WRAP_COLUMN = 72, FORMAT_BUFFER_SIZE = 128 };

void format(int c)
{
    static int bufptr = 0;
    static char buffer[FORMAT_BUFFER_SIZE];
    static int table = 0;
    int brk;

    if (c == 7) {
        if (table) {
            c = '\n';
            table = 0;
        } else {
            table = 1;
            c = '\t';
        }
    } else {
        table = 0;
    }

    if (c == '\n') {
        if (bufptr > 0) {
            buffer[bufptr] = 0;
            printf("%s\n\n", buffer);
            bufptr = 0;
        } else {
            putc('\n', stdout);
        }
        return;
    }

    /* collapse runs of spaces */
    if (bufptr && c == ' ' && buffer[bufptr - 1] == ' ')
        return;

    buffer[bufptr++] = (char)c;
    if (bufptr <= FORMAT_WRAP_COLUMN)
        return;

    /* look backwards for the last place where the line may be broken */
    for (brk = bufptr - 1; brk >= 0; brk--) {
        if (buffer[brk] == ' ' || buffer[brk] == '\t')
            break;
    }

    if (brk >= 0) {
        int rest = bufptr - (brk + 1);

        buffer[brk] = 0;
        printf("%s\n", buffer);
        /* keep the tail after the break; the regions may overlap */
        memmove(buffer, buffer + brk + 1, (size_t)rest);
        bufptr = rest;
    } else if (bufptr >= FORMAT_BUFFER_SIZE - 1) {
        /* no break point and no room left (one byte is needed for the
           terminator): force a line break instead of overrunning */
        buffer[bufptr] = 0;
        printf("%s\n", buffer);
        bufptr = 0;
    }
}
/* main */
/*
 * Usage: catdoc filename     ("-" reads from standard input)
 *
 * Skips the fixed-size Word header, then passes every remaining byte through
 * recode() and format() until end of file or until two consecutive NUL
 * bytes are seen.  Returns 0 on success, 1 on a usage or open error.
 */
enum { WORD_HEADER_SIZE = 0xD00 };

int main(int argc, char **argv)
{
    FILE *f;
    int i, c;
    int zero = 0;       /* set while the previous byte was a NUL */
    int last = '\n';    /* last character handed to format() */

    if (argc <= 1) {
        fprintf(stderr,"Usage:catdoc filename\n");
        return 1;
    }
    if (!strcmp(argv[1], "-")) {
        f = stdin;
    } else {
        /* Author's note: Turbo C 2.0 breaks here with an unexpected call to
           INT 18H (ROM-Basic call); the reason is not known. */
        f = fopen(argv[1], "rb");
    }
    if (f == NULL) {
        fprintf(stderr,"Error: cannot open %s\n",argv[1]);
        return 1;
    }

    /* skip word header; stop early if the file is shorter than that */
    for (i = 0; i < WORD_HEADER_SIZE; i++) {
        if (fgetc(f) == EOF)
            break;
    }

    while ((c = fgetc(f)) != EOF) {
        /* After the end of the text a Word file carries further information
           that starts with a lot of NUL bytes, so processing stops at two
           consecutive NULs.  A single NUL is skipped. */
        if (c == 0) {
            if (zero)
                break;
            zero = 1;
        } else {
            zero = 0;   /* the NULs must be adjacent to end the text */
            last = recode(c);
            format(last);
        }
    }

    /* format() keeps the current line buffered until it sees a newline, so
       push one through if the text did not end with one; otherwise the last
       line would never be printed.  (If the buffer happens to be empty
       already this only adds one trailing newline.) */
    if (last != '\n')
        format('\n');

    if (f != stdin)
        fclose(f);
    return 0;
}

---------------------------------------------------------------------------
Phone: 7(095)230-80-61 Victor B. Wagner
Fax: 7(095)230-80-42 Dokuchaev Soil Institute
EMail: vi...@agropc.msk.su Pyzhevsky lane, 7
Moscow,109017 Russia

0 new messages