wmii

git clone git://oldgit.suckless.org/wmii/
Log | Files | Refs | README | LICENSE

regcomp.c (9694B)


      1 #include <setjmp.h>
      2 #include <stdlib.h>
      3 #include "plan9.h"
      4 #include "regexp9.h"
      5 #include "regcomp.h"
      6 
      7 enum {
      8 	FALSE,
      9 	TRUE,
     10 };
     11 
     12 /*
     13  * Parser Information
     14  */
     15 typedef struct Node Node;
     16 struct Node
     17 {
     18 	Reinst*	first;
     19 	Reinst*	last;
     20 };
     21 
     22 #define	NSTACK	20
     23 static	Node	andstack[NSTACK];
     24 static	Node	*andp;
     25 static	int	atorstack[NSTACK];
     26 static	int*	atorp;
     27 static	int	cursubid;		/* id of current subexpression */
     28 static	int	subidstack[NSTACK];	/* parallel to atorstack */
     29 static	int*	subidp;
     30 static	int	lastwasand;	/* Last token was operand */
     31 static	int	nbra;
     32 static	char*	exprp;		/* pointer to next character in source expression */
     33 static	int	lexdone;
     34 static	int	nclass;
     35 static	Reclass*classp;
     36 static	Reinst*	freep;
     37 static	int	errors;
     38 static	Rune	yyrune;		/* last lex'd rune */
     39 static	Reclass*yyclassp;	/* last lex'd class */
     40 
     41 /* predeclared crap */
     42 static	void	operator(int);
     43 static	void	pushand(Reinst*, Reinst*);
     44 static	void	pushator(int);
     45 static	void	evaluntil(int);
     46 static	int	bldcclass(void);
     47 
     48 static jmp_buf regkaboom;
     49 
     50 static	void
     51 rcerror(char *s)
     52 {
     53 	errors++;
     54 	regerror(s);
     55 	longjmp(regkaboom, 1);
     56 }
     57 
     58 static	Reinst*
     59 newinst(int t)
     60 {
     61 	freep->type = t;
     62 	freep->u2.left = 0;
     63 	freep->u1.right = 0;
     64 	return freep++;
     65 }
     66 
     67 static	void
     68 operand(int t)
     69 {
     70 	Reinst *i;
     71 
     72 	if(lastwasand)
     73 		operator(CAT);	/* catenate is implicit */
     74 	i = newinst(t);
     75 
     76 	if(t == CCLASS || t == NCCLASS)
     77 		i->u1.cp = yyclassp;
     78 	if(t == RUNE)
     79 		i->u1.r = yyrune;
     80 
     81 	pushand(i, i);
     82 	lastwasand = TRUE;
     83 }
     84 
     85 static	void
     86 operator(int t)
     87 {
     88 	if(t==RBRA && --nbra<0)
     89 		rcerror("unmatched right paren");
     90 	if(t==LBRA){
     91 		if(++cursubid >= NSUBEXP)
     92 			rcerror ("too many subexpressions");
     93 		nbra++;
     94 		if(lastwasand)
     95 			operator(CAT);
     96 	} else
     97 		evaluntil(t);
     98 	if(t != RBRA)
     99 		pushator(t);
    100 	lastwasand = FALSE;
    101 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
    102 		lastwasand = TRUE;	/* these look like operands */
    103 }
    104 
    105 static	void
    106 regerr2(char *s, int c)
    107 {
    108 	char buf[100];
    109 	char *cp = buf;
    110 	while(*s)
    111 		*cp++ = *s++;
    112 	*cp++ = c;
    113 	*cp = '\0';
    114 	rcerror(buf);
    115 }
    116 
    117 static	void
    118 cant(char *s)
    119 {
    120 	char buf[100];
    121 	strcpy(buf, "can't happen: ");
    122 	strcat(buf, s);
    123 	rcerror(buf);
    124 }
    125 
    126 static	void
    127 pushand(Reinst *f, Reinst *l)
    128 {
    129 	if(andp >= &andstack[NSTACK])
    130 		cant("operand stack overflow");
    131 	andp->first = f;
    132 	andp->last = l;
    133 	andp++;
    134 }
    135 
    136 static	void
    137 pushator(int t)
    138 {
    139 	if(atorp >= &atorstack[NSTACK])
    140 		cant("operator stack overflow");
    141 	*atorp++ = t;
    142 	*subidp++ = cursubid;
    143 }
    144 
    145 static	Node*
    146 popand(int op)
    147 {
    148 	Reinst *inst;
    149 
    150 	if(andp <= &andstack[0]){
    151 		regerr2("missing operand for ", op);
    152 		inst = newinst(NOP);
    153 		pushand(inst,inst);
    154 	}
    155 	return --andp;
    156 }
    157 
    158 static	int
    159 popator(void)
    160 {
    161 	if(atorp <= &atorstack[0])
    162 		cant("operator stack underflow");
    163 	--subidp;
    164 	return *--atorp;
    165 }
    166 
    167 static	void
    168 evaluntil(int pri)
    169 {
    170 	Node *op1, *op2;
    171 	Reinst *inst1, *inst2;
    172 
    173 	while(pri==RBRA || atorp[-1]>=pri){
    174 		switch(popator()){
    175 		default:
    176 			rcerror("unknown operator in evaluntil");
    177 			break;
    178 		case LBRA:		/* must have been RBRA */
    179 			op1 = popand('(');
    180 			inst2 = newinst(RBRA);
    181 			inst2->u1.subid = *subidp;
    182 			op1->last->u2.next = inst2;
    183 			inst1 = newinst(LBRA);
    184 			inst1->u1.subid = *subidp;
    185 			inst1->u2.next = op1->first;
    186 			pushand(inst1, inst2);
    187 			return;
    188 		case OR:
    189 			op2 = popand('|');
    190 			op1 = popand('|');
    191 			inst2 = newinst(NOP);
    192 			op2->last->u2.next = inst2;
    193 			op1->last->u2.next = inst2;
    194 			inst1 = newinst(OR);
    195 			inst1->u1.right = op1->first;
    196 			inst1->u2.left = op2->first;
    197 			pushand(inst1, inst2);
    198 			break;
    199 		case CAT:
    200 			op2 = popand(0);
    201 			op1 = popand(0);
    202 			op1->last->u2.next = op2->first;
    203 			pushand(op1->first, op2->last);
    204 			break;
    205 		case STAR:
    206 			op2 = popand('*');
    207 			inst1 = newinst(OR);
    208 			op2->last->u2.next = inst1;
    209 			inst1->u1.right = op2->first;
    210 			pushand(inst1, inst1);
    211 			break;
    212 		case PLUS:
    213 			op2 = popand('+');
    214 			inst1 = newinst(OR);
    215 			op2->last->u2.next = inst1;
    216 			inst1->u1.right = op2->first;
    217 			pushand(op2->first, inst1);
    218 			break;
    219 		case QUEST:
    220 			op2 = popand('?');
    221 			inst1 = newinst(OR);
    222 			inst2 = newinst(NOP);
    223 			inst1->u2.left = inst2;
    224 			inst1->u1.right = op2->first;
    225 			op2->last->u2.next = inst2;
    226 			pushand(inst1, inst2);
    227 			break;
    228 		}
    229 	}
    230 }
    231 
    232 static	Reprog*
    233 optimize(Reprog *pp)
    234 {
    235 	Reinst *inst, *target;
    236 	int size;
    237 	Reprog *npp;
    238 	Reclass *cl;
    239 	int diff;
    240 
    241 	/*
    242 	 *  get rid of NOOP chains
    243 	 */
    244 	for(inst=pp->firstinst; inst->type!=END; inst++){
    245 		target = inst->u2.next;
    246 		while(target->type == NOP)
    247 			target = target->u2.next;
    248 		inst->u2.next = target;
    249 	}
    250 
    251 	/*
    252 	 *  The original allocation is for an area larger than
    253 	 *  necessary.  Reallocate to the actual space used
    254 	 *  and then relocate the code.
    255 	 */
    256 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
    257 	npp = realloc(pp, size);
    258 	if(npp==0 || npp==pp)
    259 		return pp;
    260 	diff = (char *)npp - (char *)pp;
    261 	freep = (Reinst *)((char *)freep + diff);
    262 	for(inst=npp->firstinst; inst<freep; inst++){
    263 		switch(inst->type){
    264 		case OR:
    265 		case STAR:
    266 		case PLUS:
    267 		case QUEST:
    268 			*(char **)&inst->u1.right += diff;
    269 			break;
    270 		case CCLASS:
    271 		case NCCLASS:
    272 			*(char **)&inst->u1.right += diff;
    273 			cl = inst->u1.cp;
    274 			*(char **)&cl->end += diff;
    275 			break;
    276 		}
    277 		*(char **)&inst->u2.left += diff;
    278 	}
    279 	*(char **)&npp->startinst += diff;
    280 	return npp;
    281 }
    282 
    283 #ifdef	DEBUG
    284 static	void
    285 dumpstack(void){
    286 	Node *stk;
    287 	int *ip;
    288 
    289 	print("operators\n");
    290 	for(ip=atorstack; ip<atorp; ip++)
    291 		print("0%o\n", *ip);
    292 	print("operands\n");
    293 	for(stk=andstack; stk<andp; stk++)
    294 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
    295 }
    296 
    297 static	void
    298 dump(Reprog *pp)
    299 {
    300 	Reinst *l;
    301 	Rune *p;
    302 
    303 	l = pp->firstinst;
    304 	do{
    305 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
    306 			l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
    307 		if(l->type == RUNE)
    308 			print("\t%C\n", l->u1.r);
    309 		else if(l->type == CCLASS || l->type == NCCLASS){
    310 			print("\t[");
    311 			if(l->type == NCCLASS)
    312 				print("^");
    313 			for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
    314 				if(p[0] == p[1])
    315 					print("%C", p[0]);
    316 				else
    317 					print("%C-%C", p[0], p[1]);
    318 			print("]\n");
    319 		} else
    320 			print("\n");
    321 	}while(l++->type);
    322 }
    323 #endif
    324 
    325 static	Reclass*
    326 newclass(void)
    327 {
    328 	if(nclass >= NCLASS)
    329 		regerr2("too many character classes; limit", NCLASS+'0');
    330 	return &(classp[nclass++]);
    331 }
    332 
    333 static	int
    334 nextc(Rune *rp)
    335 {
    336 	if(lexdone){
    337 		*rp = 0;
    338 		return 1;
    339 	}
    340 	exprp += chartorune(rp, exprp);
    341 	if(*rp == '\\'){
    342 		exprp += chartorune(rp, exprp);
    343 		return 1;
    344 	}
    345 	if(*rp == 0)
    346 		lexdone = 1;
    347 	return 0;
    348 }
    349 
    350 static	int
    351 lex(int literal, int dot_type)
    352 {
    353 	int quoted;
    354 
    355 	quoted = nextc(&yyrune);
    356 	if(literal || quoted){
    357 		if(yyrune == 0)
    358 			return END;
    359 		return RUNE;
    360 	}
    361 
    362 	switch(yyrune){
    363 	case 0:
    364 		return END;
    365 	case '*':
    366 		return STAR;
    367 	case '?':
    368 		return QUEST;
    369 	case '+':
    370 		return PLUS;
    371 	case '|':
    372 		return OR;
    373 	case '.':
    374 		return dot_type;
    375 	case '(':
    376 		return LBRA;
    377 	case ')':
    378 		return RBRA;
    379 	case '^':
    380 		return BOL;
    381 	case '$':
    382 		return EOL;
    383 	case '[':
    384 		return bldcclass();
    385 	}
    386 	return RUNE;
    387 }
    388 
    389 static int
    390 bldcclass(void)
    391 {
    392 	int type;
    393 	Rune r[NCCRUNE];
    394 	Rune *p, *ep, *np;
    395 	Rune rune;
    396 	int quoted;
    397 
    398 	/* we have already seen the '[' */
    399 	type = CCLASS;
    400 	yyclassp = newclass();
    401 
    402 	/* look ahead for negation */
    403 	/* SPECIAL CASE!!! negated classes don't match \n */
    404 	ep = r;
    405 	quoted = nextc(&rune);
    406 	if(!quoted && rune == '^'){
    407 		type = NCCLASS;
    408 		quoted = nextc(&rune);
    409 		*ep++ = '\n';
    410 		*ep++ = '\n';
    411 	}
    412 
    413 	/* parse class into a set of spans */
    414 	for(; ep<&r[NCCRUNE];){
    415 		if(rune == 0){
    416 			rcerror("malformed '[]'");
    417 			return 0;
    418 		}
    419 		if(!quoted && rune == ']')
    420 			break;
    421 		if(!quoted && rune == '-'){
    422 			if(ep == r){
    423 				rcerror("malformed '[]'");
    424 				return 0;
    425 			}
    426 			quoted = nextc(&rune);
    427 			if((!quoted && rune == ']') || rune == 0){
    428 				rcerror("malformed '[]'");
    429 				return 0;
    430 			}
    431 			*(ep-1) = rune;
    432 		} else {
    433 			*ep++ = rune;
    434 			*ep++ = rune;
    435 		}
    436 		quoted = nextc(&rune);
    437 	}
    438 
    439 	/* sort on span start */
    440 	for(p = r; p < ep; p += 2){
    441 		for(np = p; np < ep; np += 2)
    442 			if(*np < *p){
    443 				rune = np[0];
    444 				np[0] = p[0];
    445 				p[0] = rune;
    446 				rune = np[1];
    447 				np[1] = p[1];
    448 				p[1] = rune;
    449 			}
    450 	}
    451 
    452 	/* merge spans */
    453 	np = yyclassp->spans;
    454 	p = r;
    455 	if(r == ep)
    456 		yyclassp->end = np;
    457 	else {
    458 		np[0] = *p++;
    459 		np[1] = *p++;
    460 		for(; p < ep; p += 2)
    461 			if(p[0] <= np[1]){
    462 				if(p[1] > np[1])
    463 					np[1] = p[1];
    464 			} else {
    465 				np += 2;
    466 				np[0] = p[0];
    467 				np[1] = p[1];
    468 			}
    469 		yyclassp->end = np+2;
    470 	}
    471 
    472 	return type;
    473 }
    474 
    475 static	Reprog*
    476 regcomp1(char *s, int literal, int dot_type)
    477 {
    478 	int token;
    479 	Reprog *volatile pp;
    480 
    481 	/* get memory for the program */
    482 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
    483 	if(pp == 0){
    484 		regerror("out of memory");
    485 		return 0;
    486 	}
    487 	freep = pp->firstinst;
    488 	classp = pp->class;
    489 	errors = 0;
    490 
    491 	if(setjmp(regkaboom))
    492 		goto out;
    493 
    494 	/* go compile the sucker */
    495 	lexdone = 0;
    496 	exprp = s;
    497 	nclass = 0;
    498 	nbra = 0;
    499 	atorp = atorstack;
    500 	andp = andstack;
    501 	subidp = subidstack;
    502 	lastwasand = FALSE;
    503 	cursubid = 0;
    504 
    505 	/* Start with a low priority operator to prime parser */
    506 	pushator(START-1);
    507 	while((token = lex(literal, dot_type)) != END){
    508 		if((token&0300) == OPERATOR)
    509 			operator(token);
    510 		else
    511 			operand(token);
    512 	}
    513 
    514 	/* Close with a low priority operator */
    515 	evaluntil(START);
    516 
    517 	/* Force END */
    518 	operand(END);
    519 	evaluntil(START);
    520 #ifdef DEBUG
    521 	dumpstack();
    522 #endif
    523 	if(nbra)
    524 		rcerror("unmatched left paren");
    525 	--andp;	/* points to first and only operand */
    526 	pp->startinst = andp->first;
    527 #ifdef DEBUG
    528 	dump(pp);
    529 #endif
    530 	pp = optimize(pp);
    531 #ifdef DEBUG
    532 	print("start: %d\n", andp->first-pp->firstinst);
    533 	dump(pp);
    534 #endif
    535 out:
    536 	if(errors){
    537 		free(pp);
    538 		pp = 0;
    539 	}
    540 	return pp;
    541 }
    542 
    543 extern	Reprog*
    544 regcomp(char *s)
    545 {
    546 	return regcomp1(s, 0, ANY);
    547 }
    548 
    549 extern	Reprog*
    550 regcomplit(char *s)
    551 {
    552 	return regcomp1(s, 1, ANY);
    553 }
    554 
    555 extern	Reprog*
    556 regcompnl(char *s)
    557 {
    558 	return regcomp1(s, 0, ANYNL);
    559 }