regcomp.c (9694B)
1 #include <setjmp.h> 2 #include <stdlib.h> 3 #include "plan9.h" 4 #include "regexp9.h" 5 #include "regcomp.h" 6 7 enum { 8 FALSE, 9 TRUE, 10 }; 11 12 /* 13 * Parser Information 14 */ 15 typedef struct Node Node; 16 struct Node 17 { 18 Reinst* first; 19 Reinst* last; 20 }; 21 22 #define NSTACK 20 23 static Node andstack[NSTACK]; 24 static Node *andp; 25 static int atorstack[NSTACK]; 26 static int* atorp; 27 static int cursubid; /* id of current subexpression */ 28 static int subidstack[NSTACK]; /* parallel to atorstack */ 29 static int* subidp; 30 static int lastwasand; /* Last token was operand */ 31 static int nbra; 32 static char* exprp; /* pointer to next character in source expression */ 33 static int lexdone; 34 static int nclass; 35 static Reclass*classp; 36 static Reinst* freep; 37 static int errors; 38 static Rune yyrune; /* last lex'd rune */ 39 static Reclass*yyclassp; /* last lex'd class */ 40 41 /* predeclared crap */ 42 static void operator(int); 43 static void pushand(Reinst*, Reinst*); 44 static void pushator(int); 45 static void evaluntil(int); 46 static int bldcclass(void); 47 48 static jmp_buf regkaboom; 49 50 static void 51 rcerror(char *s) 52 { 53 errors++; 54 regerror(s); 55 longjmp(regkaboom, 1); 56 } 57 58 static Reinst* 59 newinst(int t) 60 { 61 freep->type = t; 62 freep->u2.left = 0; 63 freep->u1.right = 0; 64 return freep++; 65 } 66 67 static void 68 operand(int t) 69 { 70 Reinst *i; 71 72 if(lastwasand) 73 operator(CAT); /* catenate is implicit */ 74 i = newinst(t); 75 76 if(t == CCLASS || t == NCCLASS) 77 i->u1.cp = yyclassp; 78 if(t == RUNE) 79 i->u1.r = yyrune; 80 81 pushand(i, i); 82 lastwasand = TRUE; 83 } 84 85 static void 86 operator(int t) 87 { 88 if(t==RBRA && --nbra<0) 89 rcerror("unmatched right paren"); 90 if(t==LBRA){ 91 if(++cursubid >= NSUBEXP) 92 rcerror ("too many subexpressions"); 93 nbra++; 94 if(lastwasand) 95 operator(CAT); 96 } else 97 evaluntil(t); 98 if(t != RBRA) 99 pushator(t); 100 lastwasand = FALSE; 101 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 102 lastwasand = TRUE; /* these look like operands */ 103 } 104 105 static void 106 regerr2(char *s, int c) 107 { 108 char buf[100]; 109 char *cp = buf; 110 while(*s) 111 *cp++ = *s++; 112 *cp++ = c; 113 *cp = '\0'; 114 rcerror(buf); 115 } 116 117 static void 118 cant(char *s) 119 { 120 char buf[100]; 121 strcpy(buf, "can't happen: "); 122 strcat(buf, s); 123 rcerror(buf); 124 } 125 126 static void 127 pushand(Reinst *f, Reinst *l) 128 { 129 if(andp >= &andstack[NSTACK]) 130 cant("operand stack overflow"); 131 andp->first = f; 132 andp->last = l; 133 andp++; 134 } 135 136 static void 137 pushator(int t) 138 { 139 if(atorp >= &atorstack[NSTACK]) 140 cant("operator stack overflow"); 141 *atorp++ = t; 142 *subidp++ = cursubid; 143 } 144 145 static Node* 146 popand(int op) 147 { 148 Reinst *inst; 149 150 if(andp <= &andstack[0]){ 151 regerr2("missing operand for ", op); 152 inst = newinst(NOP); 153 pushand(inst,inst); 154 } 155 return --andp; 156 } 157 158 static int 159 popator(void) 160 { 161 if(atorp <= &atorstack[0]) 162 cant("operator stack underflow"); 163 --subidp; 164 return *--atorp; 165 } 166 167 static void 168 evaluntil(int pri) 169 { 170 Node *op1, *op2; 171 Reinst *inst1, *inst2; 172 173 while(pri==RBRA || atorp[-1]>=pri){ 174 switch(popator()){ 175 default: 176 rcerror("unknown operator in evaluntil"); 177 break; 178 case LBRA: /* must have been RBRA */ 179 op1 = popand('('); 180 inst2 = newinst(RBRA); 181 inst2->u1.subid = *subidp; 182 op1->last->u2.next = inst2; 183 inst1 = newinst(LBRA); 184 inst1->u1.subid = *subidp; 185 inst1->u2.next = op1->first; 186 pushand(inst1, inst2); 187 return; 188 case OR: 189 op2 = popand('|'); 190 op1 = popand('|'); 191 inst2 = newinst(NOP); 192 op2->last->u2.next = inst2; 193 op1->last->u2.next = inst2; 194 inst1 = newinst(OR); 195 inst1->u1.right = op1->first; 196 inst1->u2.left = op2->first; 197 pushand(inst1, inst2); 198 break; 199 case CAT: 200 op2 = popand(0); 201 op1 = popand(0); 202 op1->last->u2.next = op2->first; 203 pushand(op1->first, op2->last); 204 break; 205 case STAR: 206 op2 = popand('*'); 207 inst1 = newinst(OR); 208 op2->last->u2.next = inst1; 209 inst1->u1.right = op2->first; 210 pushand(inst1, inst1); 211 break; 212 case PLUS: 213 op2 = popand('+'); 214 inst1 = newinst(OR); 215 op2->last->u2.next = inst1; 216 inst1->u1.right = op2->first; 217 pushand(op2->first, inst1); 218 break; 219 case QUEST: 220 op2 = popand('?'); 221 inst1 = newinst(OR); 222 inst2 = newinst(NOP); 223 inst1->u2.left = inst2; 224 inst1->u1.right = op2->first; 225 op2->last->u2.next = inst2; 226 pushand(inst1, inst2); 227 break; 228 } 229 } 230 } 231 232 static Reprog* 233 optimize(Reprog *pp) 234 { 235 Reinst *inst, *target; 236 int size; 237 Reprog *npp; 238 Reclass *cl; 239 int diff; 240 241 /* 242 * get rid of NOOP chains 243 */ 244 for(inst=pp->firstinst; inst->type!=END; inst++){ 245 target = inst->u2.next; 246 while(target->type == NOP) 247 target = target->u2.next; 248 inst->u2.next = target; 249 } 250 251 /* 252 * The original allocation is for an area larger than 253 * necessary. Reallocate to the actual space used 254 * and then relocate the code. 255 */ 256 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 257 npp = realloc(pp, size); 258 if(npp==0 || npp==pp) 259 return pp; 260 diff = (char *)npp - (char *)pp; 261 freep = (Reinst *)((char *)freep + diff); 262 for(inst=npp->firstinst; inst<freep; inst++){ 263 switch(inst->type){ 264 case OR: 265 case STAR: 266 case PLUS: 267 case QUEST: 268 *(char **)&inst->u1.right += diff; 269 break; 270 case CCLASS: 271 case NCCLASS: 272 *(char **)&inst->u1.right += diff; 273 cl = inst->u1.cp; 274 *(char **)&cl->end += diff; 275 break; 276 } 277 *(char **)&inst->u2.left += diff; 278 } 279 *(char **)&npp->startinst += diff; 280 return npp; 281 } 282 283 #ifdef DEBUG 284 static void 285 dumpstack(void){ 286 Node *stk; 287 int *ip; 288 289 print("operators\n"); 290 for(ip=atorstack; ip<atorp; ip++) 291 print("0%o\n", *ip); 292 print("operands\n"); 293 for(stk=andstack; stk<andp; stk++) 294 print("0%o\t0%o\n", stk->first->type, stk->last->type); 295 } 296 297 static void 298 dump(Reprog *pp) 299 { 300 Reinst *l; 301 Rune *p; 302 303 l = pp->firstinst; 304 do{ 305 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 306 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst); 307 if(l->type == RUNE) 308 print("\t%C\n", l->u1.r); 309 else if(l->type == CCLASS || l->type == NCCLASS){ 310 print("\t["); 311 if(l->type == NCCLASS) 312 print("^"); 313 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2) 314 if(p[0] == p[1]) 315 print("%C", p[0]); 316 else 317 print("%C-%C", p[0], p[1]); 318 print("]\n"); 319 } else 320 print("\n"); 321 }while(l++->type); 322 } 323 #endif 324 325 static Reclass* 326 newclass(void) 327 { 328 if(nclass >= NCLASS) 329 regerr2("too many character classes; limit", NCLASS+'0'); 330 return &(classp[nclass++]); 331 } 332 333 static int 334 nextc(Rune *rp) 335 { 336 if(lexdone){ 337 *rp = 0; 338 return 1; 339 } 340 exprp += chartorune(rp, exprp); 341 if(*rp == '\\'){ 342 exprp += chartorune(rp, exprp); 343 return 1; 344 } 345 if(*rp == 0) 346 lexdone = 1; 347 return 0; 348 } 349 350 static int 351 lex(int literal, int dot_type) 352 { 353 int quoted; 354 355 quoted = nextc(&yyrune); 356 if(literal || quoted){ 357 if(yyrune == 0) 358 return END; 359 return RUNE; 360 } 361 362 switch(yyrune){ 363 case 0: 364 return END; 365 case '*': 366 return STAR; 367 case '?': 368 return QUEST; 369 case '+': 370 return PLUS; 371 case '|': 372 return OR; 373 case '.': 374 return dot_type; 375 case '(': 376 return LBRA; 377 case ')': 378 return RBRA; 379 case '^': 380 return BOL; 381 case '$': 382 return EOL; 383 case '[': 384 return bldcclass(); 385 } 386 return RUNE; 387 } 388 389 static int 390 bldcclass(void) 391 { 392 int type; 393 Rune r[NCCRUNE]; 394 Rune *p, *ep, *np; 395 Rune rune; 396 int quoted; 397 398 /* we have already seen the '[' */ 399 type = CCLASS; 400 yyclassp = newclass(); 401 402 /* look ahead for negation */ 403 /* SPECIAL CASE!!! negated classes don't match \n */ 404 ep = r; 405 quoted = nextc(&rune); 406 if(!quoted && rune == '^'){ 407 type = NCCLASS; 408 quoted = nextc(&rune); 409 *ep++ = '\n'; 410 *ep++ = '\n'; 411 } 412 413 /* parse class into a set of spans */ 414 for(; ep<&r[NCCRUNE];){ 415 if(rune == 0){ 416 rcerror("malformed '[]'"); 417 return 0; 418 } 419 if(!quoted && rune == ']') 420 break; 421 if(!quoted && rune == '-'){ 422 if(ep == r){ 423 rcerror("malformed '[]'"); 424 return 0; 425 } 426 quoted = nextc(&rune); 427 if((!quoted && rune == ']') || rune == 0){ 428 rcerror("malformed '[]'"); 429 return 0; 430 } 431 *(ep-1) = rune; 432 } else { 433 *ep++ = rune; 434 *ep++ = rune; 435 } 436 quoted = nextc(&rune); 437 } 438 439 /* sort on span start */ 440 for(p = r; p < ep; p += 2){ 441 for(np = p; np < ep; np += 2) 442 if(*np < *p){ 443 rune = np[0]; 444 np[0] = p[0]; 445 p[0] = rune; 446 rune = np[1]; 447 np[1] = p[1]; 448 p[1] = rune; 449 } 450 } 451 452 /* merge spans */ 453 np = yyclassp->spans; 454 p = r; 455 if(r == ep) 456 yyclassp->end = np; 457 else { 458 np[0] = *p++; 459 np[1] = *p++; 460 for(; p < ep; p += 2) 461 if(p[0] <= np[1]){ 462 if(p[1] > np[1]) 463 np[1] = p[1]; 464 } else { 465 np += 2; 466 np[0] = p[0]; 467 np[1] = p[1]; 468 } 469 yyclassp->end = np+2; 470 } 471 472 return type; 473 } 474 475 static Reprog* 476 regcomp1(char *s, int literal, int dot_type) 477 { 478 int token; 479 Reprog *volatile pp; 480 481 /* get memory for the program */ 482 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 483 if(pp == 0){ 484 regerror("out of memory"); 485 return 0; 486 } 487 freep = pp->firstinst; 488 classp = pp->class; 489 errors = 0; 490 491 if(setjmp(regkaboom)) 492 goto out; 493 494 /* go compile the sucker */ 495 lexdone = 0; 496 exprp = s; 497 nclass = 0; 498 nbra = 0; 499 atorp = atorstack; 500 andp = andstack; 501 subidp = subidstack; 502 lastwasand = FALSE; 503 cursubid = 0; 504 505 /* Start with a low priority operator to prime parser */ 506 pushator(START-1); 507 while((token = lex(literal, dot_type)) != END){ 508 if((token&0300) == OPERATOR) 509 operator(token); 510 else 511 operand(token); 512 } 513 514 /* Close with a low priority operator */ 515 evaluntil(START); 516 517 /* Force END */ 518 operand(END); 519 evaluntil(START); 520 #ifdef DEBUG 521 dumpstack(); 522 #endif 523 if(nbra) 524 rcerror("unmatched left paren"); 525 --andp; /* points to first and only operand */ 526 pp->startinst = andp->first; 527 #ifdef DEBUG 528 dump(pp); 529 #endif 530 pp = optimize(pp); 531 #ifdef DEBUG 532 print("start: %d\n", andp->first-pp->firstinst); 533 dump(pp); 534 #endif 535 out: 536 if(errors){ 537 free(pp); 538 pp = 0; 539 } 540 return pp; 541 } 542 543 extern Reprog* 544 regcomp(char *s) 545 { 546 return regcomp1(s, 0, ANY); 547 } 548 549 extern Reprog* 550 regcomplit(char *s) 551 { 552 return regcomp1(s, 1, ANY); 553 } 554 555 extern Reprog* 556 regcompnl(char *s) 557 { 558 return regcomp1(s, 0, ANYNL); 559 }