Plan 9 from Bell Labs’s /usr/web/sources/plan9/sys/src/cmd/spell/sprog.c

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "code.h"

/* fig leaves for possibly signed char quantities */
#define ISUPPER(c)	isupper((c)&0xff)
#define ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define ISVOWEL(c)	voweltab[(c)&0xff]
#define Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
#define pair(a,b)	(((a)<<8) | (b))
#define DLEV		2
#define DSIZ		40

typedef	long	Bits;
#define	Set(h, f)	((long)(h) & (f))

Bits 	nop(char*, char*, char*, int, int);
Bits 	strip(char*, char*, char*, int, int);
Bits 	ize(char*, char*, char*, int, int);
Bits 	i_to_y(char*, char*, char*, int, int);
Bits 	ily(char*, char*, char*, int, int);
Bits 	subst(char*, char*, char*, int, int);
Bits 	CCe(char*, char*, char*, int, int);
Bits 	tion(char*, char*, char*, int, int);
Bits 	an(char*, char*, char*, int, int);
Bits 	s(char*, char*, char*, int, int);
Bits 	es(char*, char*, char*, int, int);
Bits 	bility(char*, char*, char*, int, int);
Bits 	y_to_e(char*, char*, char*, int, int);
Bits 	VCe(char*, char*, char*, int, int);

Bits 	trypref(char*, char*, int, int);
Bits	tryword(char*, char*, int, int);
Bits 	trysuff(char*, int, int);
Bits	dict(char*, char*);
void	typeprint(Bits);
void	pcomma(char*);

void	ise(void);
int	ordinal(void);
char*	skipv(char*);
int	inun(char*, Bits);
char*	ztos(char*);
void	readdict(char*);

typedef	struct	Ptab	Ptab;
struct	Ptab
{
	char*	s;
	int	flag;
};

typedef	struct	Suftab	Suftab;
struct	Suftab
{
	char	*suf;
	Bits	(*p1)(char*, char*, char*, int, int);
	int	n1;
	char	*d1;
	char	*a1;
	int	flag;
	int	affixable;
	Bits	(*p2)(char*, char*, char*, int, int);
	int	n2;
	char	*d2;
	char	*a2;
};

Suftab	staba[] = {
	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
	0
};

Suftab	stabc[] =
{
	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
	0
};
Suftab	stabd[] =
{
	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
	0
};
Suftab	stabe[] =
{
	/*
	 * V_affix for comment ->commence->commentment??
	 */
	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
	0
};
Suftab	stabg[] =
{
	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
	{"gnikam",strip,6,"","+making",NOUN,NOUN},
	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
	0
};
Suftab	stabl[] =
{
	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
	0
};
Suftab	stabm[] =
{
		/* congregational + ism */
	{"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
	{"margo",subst,-1,"-ph+m","",NOUN,NOUN},
	0
};
Suftab	stabn[] =
{
	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
	{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
	0
};
Suftab	stabp[] =
{
	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	0
};
Suftab	stabr[] =
{
	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
	{"reyhparg",nop,0,"","",0,NOUN},
	{"reyl",nop,0,"","",0,NOUN},
	{"rekam",strip,5,"","+maker",NOUN,NOUN},
	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,	i_to_y,2,"-y+ier","+er"},
	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
	0
};
Suftab	stabs[] =
{
	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,	es,2,"-y+ies","+es"},
	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH  },
	0
};
Suftab	stabt[] =
{
	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
	{"tse",strip,2,"","+st",EST,DONT_TOUCH,	i_to_y,3,"-y+iest","+est" },
	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
	0
};
Suftab	staby[] =
{
	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
	{"ytisuo",nop,0,"","",NOUN},
	{"ytilb",nop,0,"","",0,NOUN},
	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
	{"ylc",nop,0,"","",0},
	{"ylelb",nop,0,"","",0},
	{"ylelp",nop,0,"","",0},
	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
	0
};
Suftab	stabz[] =
{
	0
};
Suftab*	suftab[] =
{
	staba,
	stabz,
	stabc,
	stabd,
	stabe,
	stabz,
	stabg,
	stabz,
	stabz,
	stabz,
	stabz,
	stabl,
	stabm,
	stabn,
	stabz,
	stabp,
	stabz,
	stabr,
	stabs,
	stabt,
	stabz,
	stabz,
	stabz,
	stabz,
	staby,
	stabz,
};

Ptab	ptaba[] =
{
	"anti", 0,
	"auto", 0,
	0
};
Ptab	ptabb[] =
{
	"bio", 0,
	0
};
Ptab	ptabc[] =
{
	"counter", 0,
	0
};
Ptab	ptabd[] =
{
	"dis", 0,
	0
};
Ptab	ptabe[] =
{
	"electro", 0,
	0
};
Ptab	ptabf[] =
{
	"femto", 0,
	0
};
Ptab	ptabg[] =
{
	"geo", 0,
	"giga", 0,
	0
};
Ptab	ptabh[] =
{
	"hyper", 0,
	0
};
Ptab	ptabi[] =
{
	"immuno", 0,
	"im", IN,
	"intra", 0,
	"inter", 0,
	"in", IN,
	"ir", IN,
	"iso", 0,
	0
};
Ptab	ptabj[] =
{
	0
};
Ptab	ptabk[] =
{
	"kilo", 0,
	0
};
Ptab	ptabl[] =
{
	0
};
Ptab	ptabm[] =
{
	"magneto", 0,
	"mega", 0,
	"meta", 0,
	"micro", 0,
	"mid", 0,
	"milli", 0,
	"mini", 0,
	"mis", 0,
	"mono", 0,
	"multi", 0,
	0
};
Ptab	ptabn[] =
{
	"nano", 0,
	"neuro", 0,
	"non", 0,
	0
};
Ptab	ptabo[] =
{
	"out", 0,
	"over", 0,
	0
};
Ptab	ptabp[] =
{
	"para", 0,
	"photo", 0,
	"pico", 0,
	"poly", 0,
	"pre", 0,
	"pseudo", 0,
	"psycho", 0,
	0
};
Ptab	ptabq[] =
{
	"quasi", 0,
	0
};
Ptab	ptabr[] =
{
	"radio", 0,
	"re", 0,
	0
};
Ptab	ptabs[] =
{
	"semi", 0,
	"stereo", 0,
	"sub", 0,
	"super", 0,
	0
};
Ptab	ptabt[] =
{
	"tele", 0,
	"tera", 0,
	"thermo", 0,
	0
};
Ptab	ptabu[] =
{
	"ultra", 0,
	"under", 0,	/*must precede un*/
	"un", IN,
	0
};
Ptab	ptabv[] =
{
	0
};
Ptab	ptabw[] =
{
	0
};
Ptab	ptabx[] =
{
	0
};
Ptab	ptaby[] =
{
	0
};
Ptab	ptabz[] =
{
	0
};

Ptab*	preftab[] =
{
	ptaba,
	ptabb,
	ptabc,
	ptabd,
	ptabe,
	ptabf,
	ptabg,
	ptabh,
	ptabi,
	ptabj,
	ptabk,
	ptabl,
	ptabm,
	ptabn,
	ptabo,
	ptabp,
	ptabq,
	ptabr,
	ptabs,
	ptabt,
	ptabu,
	ptabv,
	ptabw,
	ptabx,
	ptaby,
	ptabz,
};

typedef struct {
	char *mesg;
	enum { NONE, SUFF, PREF} type;
} Deriv;

int	aflag;
int	cflag;
int	fflag;
int	vflag;
int	xflag;
int 	nflag;
char	word[500];
char*	original;
Deriv	emptyderiv;
Deriv	deriv[DSIZ+3];
char	affix[DSIZ*10];	/* 10 is longest affix message */
int	prefcount;
int 	suffcount;
char*	acmeid;
char	space[300000];	/* must be as large as "words"+"space" in pcode run */
Bits	encode[2048];	/* must be as long as "codes" in pcode run */
int	nencode;
char	voweltab[256];
char*	spacep[128*128+1];	/* pointer to words starting with 'xx' */
Biobuf	bin;
Biobuf	bout;

char*	codefile = "/sys/lib/amspell";
char*	brfile = "/sys/lib/brspell";
char*	Usage = "usage";

void
main(int argc, char *argv[])
{
	char *ep, *cp;
	char *dp;
	int j, i, c;
	int low;
	Bits h;

	Binit(&bin, 0, OREAD);
	Binit(&bout, 1, OWRITE);
	for(i=0; c = "aeiouyAEIOUY"[i]; i++)
		voweltab[c] = 1;
	while(argc > 1) {
		if(argv[1][0] != '-')
			break;
		for(i=1; c = argv[1][i]; i++)
		switch(c) {
		default:
			fprint(2, "usage: spell [-bcCvx] [-f file]\n");
			exits(Usage);

		case 'a':
			aflag++;
			continue;

		case 'b':
			ise();
			if(!fflag)
				codefile = brfile;
			continue;

		case 'C':		/* for "correct" */
			vflag++;
		case 'c':		/* for ocr */
			cflag++;
			continue;

		case 'v':
			vflag++;
			continue;

		case 'x':
			xflag++;
			continue;

		case 'f':
			if(argc <= 2) {
				fprint(2, "spell: -f requires another argument\n");
				exits(Usage);
			}
			argv++;
			argc--;
			codefile = argv[1];
			fflag++;
			goto brk;
		}
	brk:
		argv++;
		argc--;
	}
	readdict(codefile);
	if(argc > 1) {
		fprint(2, "usage: spell [-bcCvx] [-f file]\n");
		exits(Usage);
	}
	if(aflag)
		cflag = vflag = 0;

	for(;;) {
		affix[0] = 0;
		original = Brdline(&bin, '\n');
		if(original == 0)
			exits(0);
		original[Blinelen(&bin)-1] = 0;
		low = 0;

		if(aflag) {
			acmeid = original;
			while(*original != ':')
				if(*original++ == 0)
					exits(0);
			while(*++original != ':')
				if(*original == 0)
					exits(0);
			*original++ = 0;
		}
		for(ep=word,dp=original; j = *dp; ep++,dp++) {
			if(ISLOWER(j))
				low++;
			if(ep >= word+sizeof(word)-1)
				break;
			*ep = j;
		}
		*ep = 0;

		if(ISDIGIT(word[0]) && ordinal())
			continue;

		h = 0;
		if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
			for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
				*dp = Tolower(*cp);
		if(!h)
		for(;;) {	/* at most twice */
			if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
				break;
			if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
				break;
			if(!ISUPPER(word[0]))
				break;
			cp = original;
			dp = word;
			while(*dp = *cp++) {
					if(!low)
						*dp = Tolower(*dp);
				dp++;
			}
			word[0] = Tolower(word[0]);
		}

		if(cflag) {
			if(!h || Set(h,STOP))
				print("-");
			else if(!vflag)
				print("+");
			else 
				print("%c",'0' + (suffcount>0) +
				   (prefcount>4? 8: 2*prefcount));
		} else if(!h || Set(h,STOP)) {
			if(aflag)
				Bprint(&bout, "%s:%s\n", acmeid, original);
			else
				Bprint(&bout, "%s\n", original);
		} else if(affix[0] != 0 && affix[0] != '.')
			print("%s\t%s\n", affix, original);
	}
	/* not reached */
}

/*	strip exactly one suffix and do
 *	indicated routine(s), which may recursively
 *	strip suffixes
 */
Bits
trysuff(char* ep, int lev, int flag)
{
	Suftab *t;
	char *cp, *sp;
	Bits h = 0;
	int initchar = ep[-1];

	flag &= ~MONO;
	lev += DLEV;
	if(lev < DSIZ) {
		deriv[lev]  = emptyderiv;
		deriv[lev-1] = emptyderiv;
	}
	if(!ISLOWER(initchar))
		return h;
	for(t=suftab[initchar-'a']; sp=t->suf; t++) {
		cp = ep;
		while(*sp)
			if(*--cp != *sp++)
				goto next;
		for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
			;
		if(sp < word)
			continue;
		if(!(t->affixable & flag))
			return 0;
		h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
		if(!h && t->p2!=0) {
			if(lev < DSIZ) {
				deriv[lev] = emptyderiv;
				deriv[lev+1] = emptyderiv;
			}
			h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
		}
		break;
	next:;
	}
	return h;
}

Bits
nop(char* ep, char* d, char* a, int lev, int flag)
{
	USED(ep, d, a, lev, flag);
	return 0;
}

Bits
cstrip(char* ep, char* d, char* a, int lev, int flag)
{
	int temp = ep[0];

	if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
		switch(pair(ep[-1],ep[0])) {
		case pair('a', 'a'):
		case pair('a', 'e'):
		case pair('a', 'i'):
		case pair('e', 'a'):
		case pair('e', 'e'):
		case pair('e', 'i'):
		case pair('i', 'i'):
		case pair('o', 'a'):
			return 0;
		}
	} else
	if(temp==ep[-1]&&temp==ep[-2])
		return 0;
	return strip(ep,d,a,lev,flag);
}

Bits
strip(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h = trypref(ep, a, lev, flag);

	USED(d);
	if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
		h = 0;
	if(h)
		return h;
	if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
		h = trypref(ep-1,a,lev,flag|MONO);
		if(h)
			return h;
	}
	return trysuff(ep,lev,flag);
}

Bits
s(char* ep, char* d, char* a, int lev, int flag)
{
	if(lev > DLEV+1)
		return 0;
	if(*ep=='s') {
		switch(ep[-1]) {
		case 'y':
			if(ISVOWEL(ep[-2])||ISUPPER(*word))
				break;	/*says Kennedys*/
		case 'x':
		case 'z':
		case 's':
			return 0;
		case 'h':
			switch(ep[-2]) {
			case 'c':
			case 's':
				return 0;
			}
		}
	}
	return strip(ep,d,a,lev,flag);
}

Bits
an(char* ep, char* d, char* a, int lev, int flag)
{
	USED(d);
	if(!ISUPPER(*word))	/*must be proper name*/
		return 0;
	return trypref(ep,a,lev,flag);
}

Bits
ize(char* ep, char* d, char* a, int lev, int flag)
{
	int temp = ep[-1];
	Bits h;

	USED(a);
	ep[-1] = 'e';
	h = strip(ep,"",d,lev,flag);
	ep[-1] = temp;
	return h;
}

Bits
y_to_e(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;
	int  temp;

	USED(a);
	switch(ep[-1]) {
	case 'a':
	case 'e':
	case 'i':
		return 0;
	}
	temp = *ep;
	*ep++ = 'e';
	h = strip(ep,"",d,lev,flag);
	ep[-1] = temp;
	return h;
}

Bits
ily(char* ep, char* d, char* a, int lev, int flag)
{
	int temp = ep[0];
	char *cp = ep;

	if(temp==ep[-1]&&temp==ep[-2])		/* sillly */
		return 0;
	if(*--cp=='y' && !ISVOWEL(*--cp))	/* happyly */
		while(cp>word)
			if(ISVOWEL(*--cp))	/* shyness */
				return 0;
	if(ep[-1]=='i')
		return i_to_y(ep,d,a,lev,flag);
	return cstrip(ep,d,a,lev,flag);
}

Bits
bility(char* ep, char* d, char* a, int lev, int flag)
{
	*ep++ = 'l';
	return y_to_e(ep,d,a,lev,flag);
}

Bits
i_to_y(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;
	int temp;

	if(ISUPPER(*word))
		return 0;
	if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
		ep[-1] = 'y';
		a = d;
	}
	h = cstrip(ep,"",a,lev,flag);
	ep[-1] = temp;
	return h;
}

Bits
es(char* ep, char* d, char* a, int lev, int flag)
{
	if(lev>DLEV)
		return 0;
	switch(ep[-1]) {
	default:
		return 0;
	case 'i':
		return i_to_y(ep,d,a,lev,flag);
	case 'h':
		switch(ep[-2]) {
		default:
			return 0;
		case 'c':
		case 's':
			break;
		}
	case 's':
	case 'z':
	case 'x':
		return strip(ep,d,a,lev,flag);
	}
}

Bits
subst(char* ep, char* d, char* a, int lev, int flag)
{
	char *u,*t;
	Bits h;

	USED(a);
	if(skipv(skipv(ep-1)) < word)
		return 0;
	for(t=d; *t!='+'; t++)
		continue;
	for(u=ep; *--t!='-';)
		*--u = *t;
	h = strip(ep,"",d,lev,flag);
	while(*++t != '+')
		continue;
	while(*++t)
		*u++ = *t;
	return h;
}

Bits
tion(char* ep, char* d, char* a, int lev, int flag)
{
	switch(ep[-2]) {
	default:
		return trypref(ep,a,lev,flag);
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
		return y_to_e(ep,d,a,lev,flag);
	}
}

/*
 * possible consonant-consonant-e ending
 */
Bits
CCe(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;

	switch(ep[-1]) {
	case 'l':
		if(ISVOWEL(ep[-2]))
			break;
		switch(ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return y_to_e(ep,d,a,lev,flag);
		}
		break;
	case 'c':
	case 'g':
		if(*ep == 'a')	/* prevent -able for -eable */
			return 0;
	case 's':
	case 'v':
	case 'z':
		if(ep[-2]==ep[-1])
			break;
		if(ISVOWEL(ep[-2]))
			break;
	case 'u':
		if(h = y_to_e(ep,d,a,lev,flag))
			return h;
		if(!(ep[-2]=='n' && ep[-1]=='g'))
			return 0;
	}
	return VCe(ep,d,a,lev,flag);
}

/*
 * possible consonant-vowel-consonant-e ending
 */
Bits
VCe(char* ep, char* d, char* a, int lev, int flag)
{
	int c;
	Bits h;

	c = ep[-1];
	if(c=='e')
		return 0;
	if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
		c = *ep;
		*ep++ = 'e';
		h = trypref(ep,d,lev,flag);
		if(!h)
			h = trysuff(ep,lev,flag);
		if(h)
			return h;
		ep--;
		*ep = c;
	}
	return cstrip(ep,d,a,lev,flag);
}

Ptab*
lookuppref(uchar** wp, char* ep)
{
	Ptab *sp;
	uchar *bp,*cp;
	unsigned int initchar = Tolower(**wp);

	if(!ISALPHA(initchar))
		return 0;
	for(sp=preftab[initchar-'a'];sp->s;sp++) {
		bp = *wp;
		for(cp= (uchar*)sp->s;*cp; )
			if(*bp++!=*cp++)
				goto next;
		for(cp=bp;cp<(uchar*)ep;cp++)
			if(ISVOWEL(*cp)) {
				*wp = bp;
				return sp;
			}
	next:;
	}
	return 0;
}

/*	while word is not in dictionary try stripping
 *	prefixes. Fail if no more prefixes.
 */
Bits
trypref(char* ep, char* a, int lev, int flag)
{
	Ptab *tp;
	char *bp, *cp;
	char *pp;
	Bits h;
	char space[20];

	if(lev<DSIZ) {
		deriv[lev].mesg = a;
		deriv[lev].type = *a=='.'? NONE: SUFF;
	}
	if(h = tryword(word,ep,lev,flag)) {
		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
			return h;
		h = 0;
	}
	bp = word;
	pp = space;
	if(lev<DSIZ) {
		deriv[lev+1].mesg = pp;
		deriv[lev+1].type = 0;
	}
	while(tp=lookuppref((uchar**)&bp,ep)) {
		*pp++ = '+';
		cp = tp->s;
		while(pp<space+sizeof(space) && (*pp = *cp++))
			pp++;
		deriv[lev+1].type += PREF;
		h = tryword(bp,ep,lev+1,flag);
		if(Set(h,NOPREF) ||
		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
			h = 0;
			break;
		}
		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
			break;
		h = 0;
	}
	if(lev < DSIZ) {
		deriv[lev+1] = emptyderiv;
		deriv[lev+2] = emptyderiv;
	}
	return h;
}

Bits
tryword(char* bp, char* ep, int lev, int flag)
{
	int  j;
	Bits h = 0;
	char duple[3];

	if(ep-bp <= 1)
		return h;
	if(flag&MONO) {
		if(lev<DSIZ) {
			deriv[++lev].mesg = duple;
			deriv[lev].type = SUFF;
		}
		duple[0] = '+';
		duple[1] = *ep;
		duple[2] = 0;
	}
	h = dict(bp, ep);
	if(vflag==0 || h==0)
		return h;
	/*
	 * when derivations are wanted, collect them
	 * for printing
	 */
	j = lev;
	prefcount = suffcount = 0;
	do {
		if(j<DSIZ && deriv[j].type) {
			strcat(affix, deriv[j].mesg);
			if(deriv[j].type == SUFF)
				suffcount++;
			else if(deriv[j].type != NONE)
				prefcount = deriv[j].type/PREF;
		}
	} while(--j > 0);
	return h;
}

int
inun(char* bp, Bits h)
{
	if(*bp == 'u')
		return Set(h, IN) == 0;
	/* *bp == 'i' */
	if(Set(h, IN) == 0)
		return 0;
	switch(bp[2]) {
	case 'r':
		return bp[1] == 'r';
	case 'm':
	case 'p':
		return bp[1] == 'm';
	}
	return bp[1] == 'n';
}

char*
skipv(char *s)
{
	if(s >= word && ISVOWEL(*s))
		s--;
	while(s >= word && !ISVOWEL(*s))
		s--;
	return s;
}

/*
 * crummy way to Britishise
 */
void
ise(void)
{
	Suftab *p;
	int i;

	for(i=0; i<26; i++)
		for(p = suftab[i]; p->suf; p++) {
			p->suf = ztos(p->suf);
			p->d1 = ztos(p->d1);
			p->a1 = ztos(p->a1);
		}
}

char*
ztos(char *as)
{
	char *s, *ds;

	for(s=as; *s; s++)
		if(*s == 'z')
			goto copy;
	return as;

copy:
	ds = strdup(as);
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}

Bits
dict(char* bp, char* ep)
{
	char *cp, *cp1, *w, *wp, *we;
	int n, f;

	w = bp;
	we = ep;
	n = ep-bp;
	if(n <= 1)
		return NOUN;

	f = w[0] & 0x7f;
	f *= 128;
	f += w[1] & 0x7f;
	bp = spacep[f];
	ep = spacep[f+1];

loop:
	if(bp >= ep) {
		if(xflag) 
			fprint(2, "=%.*s\n", utfnlen(w, n), w);
		return 0;
	}
	/*
	 * find the beginning of some word in the middle
	 */
	cp = bp + (ep-bp)/2;

	while(cp > bp && !(*cp & 0x80))
		cp--;
	while(cp > bp && (cp[-1] & 0x80))
		cp--;

	wp = w + 2;	/* skip two letters */
	cp1 = cp + 2;	/* skip affix code */
	for(;;) {
		if(wp >= we) {
			if(*cp1 & 0x80)
				goto found;
			else
				f = 1;
			break;
		}
		if(*cp1 & 0x80) {
			f = -1;
			break;
		}
		f = *cp1++ - *wp++;
		if(f != 0)
			break;
	}

	if(f < 0) {
		while(!(*cp1 & 0x80))
			cp1++;
		bp = cp1;
		goto loop;
	}
	ep = cp;
	goto loop;

found:
	f = ((cp[0] & 0x7) << 8) |
		(cp[1] & 0xff);
	if(xflag) {
		fprint(2, "=%.*s ", utfnlen(w, n), w);
		typeprint(encode[f]);
	}
	return encode[f];
}

void
typeprint(Bits h)
{

	pcomma("");
	if(h & NOUN)
		pcomma("n");
	if(h & PROP_COLLECT)
		pcomma("pc");
	if(h & VERB) {
		if((h & VERB) == VERB)
			pcomma("v");
		else
		if((h & VERB) == V_IRREG)
			pcomma("vi");
		else
		if(h & ED)
			pcomma("ed");
	}
	if(h & ADJ)
		pcomma("a");
	if(h & COMP) {
		if((h & COMP) == ACTOR)
			pcomma("er");
		else
			pcomma("comp");
	}
	if(h & DONT_TOUCH)
		pcomma("d");
	if(h & N_AFFIX)
		pcomma("na");
	if(h & ADV)
		pcomma("adv");
	if(h & ION)
		pcomma("ion");
	if(h & V_AFFIX)
		pcomma("va");
	if(h & MAN)
		pcomma("man");
	if(h & NOPREF)
		pcomma("nopref");
	if(h & MONO)
		pcomma("ms");
	if(h & IN)
		pcomma("in");
	if(h & _Y)
		pcomma("y");
	if(h & STOP)
		pcomma("s");
	fprint(2, "\n");
}

void
pcomma(char *s)
{
	static flag;

	if(*s == 0) {
		flag = 0;
		return;
	}
	if(!flag) {
		fprint(2, "%s", s);
		flag = 1;
	} else
		fprint(2, ",%s", s);
}

/*
 * is the word on of the following
 *	12th	teen
 *	21st	end in 1
 *	23rd	end in 3
 *	77th	default
 * called knowing word[0] is a digit
 */
int
ordinal(void)
{
	char *cp = word;
	static char sp[4];

	while(ISDIGIT(*cp))
		cp++;
	strncpy(sp,cp,3);
	if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
		sp[0] = Tolower(cp[0]);
		sp[1] = Tolower(cp[1]);
	}
	return 0 == strncmp(sp,
		cp[-2]=='1'? "th":	/* out of bounds if 1 digit */
		*--cp=='1'? "st":	/* harmless */
		*cp=='2'? "nd":
		*cp=='3'? "rd":
		"th", 3);
}

/*
 * read in the dictionary.
 * format is
 * {
 *	short	nencode;
 *	long	encode[nencode];
 *	char	space[*];
 * };
 *
 * the encodings are a table all different
 * affixes.
 * the dictionary proper has 2 bytes
 * that demark and then the rest of the
 * word. the 2 bytes have the following
 *	0x80 0x00	flag
 *	0x78 0x00	count of prefix bytes
 *			common with prev word
 *	0x07 0xff	affix code
 *
 * all ints are big endians in the file.
 */
void
readdict(char *file)
{
	char *s, *is, *lasts, *ls;
	int c, i, sp, p;
	int f;
	long l;

	lasts = 0;
	f = open(file, 0);
	if(f == -1) {
		fprint(2, "cannot open %s\n", file);
		exits("open");
	}
	if(read(f, space, 2) != 2)
		goto bad;
	nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
	if(read(f, space, 4*nencode) != 4*nencode)
		goto bad;
	s = space;
	for(i=0; i<nencode; i++) {
		l = (long)(s[0] & 0xff) << 24;
		l |= (s[1] & 0xff) << 16;
		l |= (s[2] & 0xff) << 8;
		l |= s[3] & 0xff;
		encode[i] = (Bits)l;
		s += 4;
	}
	l = read(f, space, sizeof(space));
	if(l == sizeof(space))
		goto noroom;
	is = space + (sizeof(space) - l);
	memmove(is, space, l);

	s = space;
	c = *is++ & 0xff;
	sp = -1;
	i = 0;

loop:
	if(s > is)
		goto noroom;
	if(c < 0) {
		close(f);
		while(sp < 128*128)
			spacep[++sp] = s;
		*s = 0x80;		/* fence */
		return;
	}
	p = (c>>3) & 0xf;
	*s++ = c;
	*s++ = *is++ & 0xff;
	if(p <= 0)
		i = (*is++ & 0xff)*128;
	if(p <= 1) {
		if(!(*is & 0x80))
			i = i/128*128 + (*is++ & 0xff);
		if(i <= sp) {
			fprint(2, "the dict isnt sorted or \n");
			fprint(2, "memmove didn't work\n");
			goto bad;
		}
		while(sp < i)
			spacep[++sp] = s-2;
	}
	ls = lasts;
	lasts = s;
	for(p-=2; p>0; p--)
		*s++ = *ls++;
	for(;;) {
		if(is >= space+sizeof(space)) {
			c = -1;
			break;
		}
		c = *is++ & 0xff;
		if(c & 0x80)
			break;
		*s++ = c;
	}
	*s = 0;
	goto loop;

bad:
	fprint(2, "trouble reading %s\n", file);
	exits("read");
noroom:
	fprint(2, "not enough space for dictionary\n");
	exits("space");
}

Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to webmaster@9p.io.