/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ /* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF */ /* UNIX System Laboratories, Inc. */ /* The copyright notice above does not evidence any */ /* actual or intended publication of such source code. */ #ident "@(#)awk:b.c 2.12" /* THIS FILE IS BASICALLY OBSOLETE. IT CONTAINS THE OLD IMPLEMENTATION * OF AWK'S REGULAR EXPRESSION CODE. FOR THE NEW POSIX-COMPLIANT CODE * (WHICH IS USED IF OLD_REGEXP ISN'T DEFINED), SEE REGEX.C. */ #if OLD_REGEXP #define DEBUG #include "awk.h" #include #include #include "y.tab.h" #include /* SGI NOTE: I believe that the following is safe, since comparisons of * HAT are almost always made against an integer rather than a character. * However, we must insure that the HAT character is not stored in any * char tables, since it exceeds the maximum size of a char. Also, * be aware that NCHARS is now 257. * -jfk, 2-May-1995 */ #define HAT (NCHARS-1) /* matches ^ in regular expr */ #define MAXLIN 256 #define type(v) (v)->nobj #define left(v) (v)->narg[0] #define right(v) (v)->narg[1] #define parent(v) (v)->nnext #define LEAF case CCL: case NCCL: case CHAR: case MCHAR: case DOT: case FINAL: case ALL: #define UNARY case STAR: case PLUS: case QUEST: /* encoding in tree Nodes: leaf (CCL, NCCL, CHAR, MCHAR, DOT, FINAL, ALL): left is index, right contains value or pointer to value unary (STAR, PLUS, QUEST): left is child, right is null binary (CAT, OR): left and right are children parent contains pointer to parent */ uchar chars[MAXLIN]; int setvec[MAXLIN]; int tmpset[MAXLIN]; Node *point[MAXLIN]; int rtok; /* next token in current re */ int rlxval; uchar *rlxstr; uchar *prestr; /* current position in current re */ uchar *lastre; /* origin of last re */ static int setcnt; static int poscnt; uchar *patbeg; int patlen; #include extern eucwidth_t WW; #define WIDTH1 WW._eucw1 #define WIDTH2 WW._eucw2 #define WIDTH3 WW._eucw3 #define WIDTH(c) (ISASCII(c) ? 1 : \ (ISSET2(c) ? WIDTH2 : (ISSET3(c) ? WIDTH3 : WIDTH1))) #define CODESET(c) (ISASCII(c) ? 0 :( ISSET2(c) ? 2 :( ISSET3(c) ? 3 : 1))) #define CURRENTC(p) (ISASCII(*p) ? (*p++) : \ (ISSET2(*p) ? (p += WIDTH2, p[-WIDTH2]) : \ (ISSET3(*p) ? (p += WIDTH3, p[-WIDTH3]) : \ (p += WIDTH1, p[-WIDTH1]) ))) #define NFA 20 /* cache this many dynamic fa's */ fa *fatab[NFA]; int nfatab = 0; /* entries in fatab */ fa *mkdfa(); const char badtype[] = ":1:Unknown type %d in %s"; static const char bigcharclas[] = "Character class too big", bigcharclasid[] = ":2", badre[] = ":3:Syntax error in regular expression %s at %s", nontermcharclas[] = ":4:Nonterminated character class %s"; fa *makedfa(s, anchor) /* returns dfa for reg expr s */ uchar *s; int anchor; { int i, use, nuse; fa *pfa; if (compile_time) /* a constant for sure */ return mkdfa(s, anchor); for (i = 0; i < nfatab; i++) /* is it there already? */ if (fatab[i]->anchor == anchor && strcmp(fatab[i]->restr,s) == 0) { fatab[i]->use++; return fatab[i]; } pfa = mkdfa(s, anchor); if (nfatab < NFA) { /* room for another */ fatab[nfatab] = pfa; fatab[nfatab]->use = 1; nfatab++; return pfa; } use = fatab[0]->use; /* replace least-recently used */ nuse = 0; for (i = 1; i < nfatab; i++) if (fatab[i]->use < use) { use = fatab[i]->use; nuse = i; } freefa(fatab[nuse]); fatab[nuse] = pfa; pfa->use = 1; return pfa; } fa *mkdfa(s, anchor) /* does the real work of making a dfa */ uchar *s; int anchor; /* anchor = 1 for anchored matches, else 0 */ { Node *p, *p1, *reparse(); fa *f; p = reparse(s); p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); /* put ALL STAR in front of reg. exp. */ p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); /* put FINAL after reg. exp. */ poscnt = 0; penter(p1); /* enter parent pointers and leaf indices */ if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) nospace("makedfa"); f->accept = poscnt-1; /* penter has computed number of positions in re */ cfoll(f, p1); /* set up follow sets */ freetr(p1); if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) nospace("makedfa"); if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) nospace("makedfa"); *f->posns[1] = 0; f->initstat = makeinit(f, anchor); f->anchor = anchor; f->restr = tostring(s); return f; } int makeinit(f, anchor) fa *f; int anchor; { register int i, k; f->curstat = 2; f->out[2] = 0; f->reset = 0; k = *(f->re[0].lfollow); xfree(f->posns[2]); if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) nospace("makeinit"); for (i=0; i<=k; i++) { (f->posns[2])[i] = (f->re[0].lfollow)[i]; } if ((f->posns[2])[1] == f->accept) f->out[2] = 1; for (i=0; igototab[2][i] = 0; f->curstat = cgoto(f, 2, HAT, (uchar *)0); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ for (i=0; iposns[0])[i] = (f->posns[2])[i]; } f->out[0] = f->out[2]; if (f->curstat != 2) --(*f->posns[f->curstat]); } return f->curstat; } penter(p) /* set up parent pointers and leaf indices */ Node *p; { switch(type(p)) { LEAF left(p) = (Node *) poscnt; point[poscnt++] = p; break; UNARY penter(left(p)); parent(left(p)) = p; break; case CAT: case OR: penter(left(p)); penter(right(p)); parent(left(p)) = p; parent(right(p)) = p; break; default: error(MM_ERROR, badtype, type(p), "penter"); break; } } freetr(p) /* free parse tree */ Node *p; { switch (type(p)) { LEAF xfree(p); break; UNARY freetr(left(p)); xfree(p); break; case CAT: case OR: freetr(left(p)); freetr(right(p)); xfree(p); break; default: error(MM_ERROR, badtype, type(p), "freetr"); break; } } uchar *cclenter(p) register uchar *p; { register int i, c; uchar *op; uchar *q; register int w, j; #define LEFTEDGEP(p) (ISASCII(p[-1]) ? p-1 : \ (ISSET2(p[-WIDTH2]) ? p-WIDTH2 : \ (ISSET3(p[-WIDTH3]) ? p-WIDTH3 : p-WIDTH1 ))) op = p; i = 0; while ((c = *p++) != 0) { if (c == '\\') { if ((c = *p++) == 't') c = '\t'; else if (c == 'n') c = '\n'; else if (c == 'f') c = '\f'; else if (c == 'r') c = '\r'; else if (c == 'b') c = '\b'; else if (c == '\\') c = '\\'; else if (isdigit(c)) { int n = c - '0'; if (isdigit(*p)) { n = 8 * n + *p++ - '0'; if (isdigit(*p)) n = 8 * n + *p++ - '0'; } c = n; } /* else */ /* c = c; */ } else if (c == '-' && i > 0 && chars[i-1] != 0) { if (*p != 0) { q = p-1; q = LEFTEDGEP(q); if (CODESET(*q) != CODESET(*p)) ; /* '-' is discarded */ else if((w = WIDTH(*p)) <= 1) { c = chars[i-1]; while (++c < (int)*p) { if (i >= MAXLIN) overflo(gettxt(bigcharclasid, bigcharclas)); chars[i++] = c; } }else { for(j=0; j= w || q[j] < p[j]) { /* store '-' only if left < right */ if (i >= MAXLIN) overflo(gettxt(bigcharclasid, bigcharclas)); chars[i++] = '-'; } } c = *p++; } } if (i >= MAXLIN-1) overflo(gettxt(bigcharclasid, bigcharclas)); chars[i++] = c; } chars[i++] = '\0'; dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, chars) ); xfree(op); return(tostring(chars)); } nospace(s) uchar *s; { error(MM_ERROR, ":5:Regular expression too big: out of space in %s", s); } overflo(s) uchar *s; { error(MM_ERROR, ":6:Regular expression too big: %s", s); } cfoll(f, v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ fa *f; register Node *v; { register int i; register int *p; switch(type(v)) { LEAF f->re[(int) left(v)].ltype = type(v); f->re[(int) left(v)].lval = (int) right(v); for (i=0; i<=f->accept; i++) setvec[i] = 0; setcnt = 0; follow(v); /* computes setvec and setcnt */ if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) overflo(gettxt(":7", "Follow set overflow")); f->re[(int) left(v)].lfollow = p; *p = setcnt; for (i = f->accept; i >= 0; i--) if (setvec[i] == 1) *++p = i; break; UNARY cfoll(f,left(v)); break; case CAT: case OR: cfoll(f,left(v)); cfoll(f,right(v)); break; default: error(MM_ERROR, badtype, type(v), "cfoll"); } } first(p) /* collects initially active leaves of p into setvec */ register Node *p; /* returns 0 or 1 depending on whether p matches empty string */ { register int b; switch(type(p)) { LEAF if (setvec[(int) left(p)] != 1) { setvec[(int) left(p)] = 1; setcnt++; } if (type(p) == CCL && (*(uchar *) right(p)) == '\0') return(0); /* empty CCL */ else return(1); case PLUS: if (first(left(p)) == 0) return(0); return(1); case STAR: case QUEST: first(left(p)); return(0); case CAT: if (first(left(p)) == 0 && first(right(p)) == 0) return(0); return(1); case OR: b = first(right(p)); if (first(left(p)) == 0 || b == 0) return(0); return(1); } error(MM_ERROR, badtype, type(p), "first"); return(-1); } follow(v) Node *v; /* collects leaves that can follow v into setvec */ { Node *p; if (type(v) == FINAL) return; p = parent(v); switch (type(p)) { case STAR: case PLUS: first(v); follow(p); return; case OR: case QUEST: follow(p); return; case CAT: if (v == left(p)) { /* v is left child of p */ if (first(right(p)) == 0) { follow(p); return; } } else /* v is right child */ follow(p); return; } } member(c, s) /* is c in s? */ register uchar c, *s; { while (*s) if (c == *s++) return(1); return(0); } memberw(p, s) register uchar *s, *p; { register int i, w; /* SGI BUG FIX: cgoto is called with a null when a HAT is found */ if (p == NULL) return 0; for( ; *s != 0; s += w) { w = WIDTH(*s); for(i=0; i < w && p[i] == s[i]; i++); if(i >= w) return(1); if(s[w] == '-' && s[w+1] != 0) { if (*p == '-') s += (w+1); else if(CODESET(*p) == CODESET(*s) && p[i] > s[i]) { s += (w+1); for(i=0; i < w && p[i] == s[i]; i++); if(i >= w || p[i] < s[i]) return(1); } } } return(0); } match(f, p) register fa *f; register uchar *p; { register int s, ns; s = f->reset?makeinit(f,0):f->initstat; if (f->out[s]) return(1); do { if ((ns=f->gototab[s][*p]) && WIDTH(*p) == 1) s=ns; else s=cgoto(f,s,*p,p); if (f->out[s]) return(1); } while (CURRENTC(p) != 0); return(0); } pmatch(f, p) register fa *f; register uchar *p; { register int s, ns; register uchar *q; int i, k; s = f->reset?makeinit(f,1):f->initstat; patbeg = p; patlen = -1; do { q = p; do { if (f->out[s]) /* final state */ patlen = q-p; if ((ns=f->gototab[s][*q]) && WIDTH(*q) == 1) s=ns; else s=cgoto(f,s,*q,q); if (s==1) /* no transition */ if (patlen >= 0) { patbeg = p; return(1); } else goto nextin; /* no match */ } while (CURRENTC(q) != 0); if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen >= 0) { patbeg = p; return(1); } nextin: s = 2; if (f->reset) { for (i=2; i<=f->curstat; i++) xfree(f->posns[i]); k = *f->posns[0]; if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) nospace("pmatch"); for (i=0; i<=k; i++) (f->posns[2])[i] = (f->posns[0])[i]; f->initstat = f->curstat = 2; f->out[2] = f->out[0]; for (i=0; igototab[2][i] = 0; } } while (CURRENTC(p) != 0); return (0); } nematch(f, p) register fa *f; register uchar *p; { register int s, ns; register uchar *q; int i, k; s = f->reset?makeinit(f,1):f->initstat; patlen = -1; while (*p) { q = p; do { if (f->out[s]) /* final state */ patlen = q-p; if ((ns=f->gototab[s][*q]) && WIDTH(*q) == 1) s=ns; else s=cgoto(f,s,*q,q); if (s==1) /* no transition */ if (patlen > 0) { patbeg = p; return(1); } else goto nnextin; /* no nonempty match */ } while (CURRENTC(q) != 0); if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen > 0 ) { patbeg = p; return(1); } nnextin: s = 2; if (f->reset) { for (i=2; i<=f->curstat; i++) xfree(f->posns[i]); k = *f->posns[0]; if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) nospace("nematch"); for (i=0; i<=k; i++) (f->posns[2])[i] = (f->posns[0])[i]; f->initstat = f->curstat = 2; f->out[2] = f->out[0]; for (i=0; igototab[2][i] = 0; } CURRENTC(p); } return (0); } Node *regexp(), *primary(), *concat(), *alt(), *unary(); Node *reparse(p) uchar *p; { /* parses regular expression pointed to by p */ /* uses relex() to scan regular expression */ Node *np; dprintf( ("Reparse <%s>\n", p) ); lastre = prestr = p; /* prestr points to string to be parsed */ rtok = relex(); if (rtok == '\0') error(MM_ERROR, ":8:Empty regular expression"); np = regexp(); if (rtok == '\0') return(np); else error(MM_ERROR, badre, lastre, prestr); /*NOTREACHED*/ } Node *regexp() { return (alt(concat(primary()))); } Node *primary() { Node *np; switch (rtok) { case CHAR: np = op2(CHAR, NIL, (Node *) rlxval); rtok = relex(); return (unary(np)); case MCHAR: np = op2(MCHAR, (Node *) 0, rlxval); rtok = relex(); return (unary(np)); case ALL: rtok = relex(); return (unary(op2(ALL, NIL, NIL))); case DOT: rtok = relex(); return (unary(op2(DOT, NIL, NIL))); case CCL: np = op2(CCL, NIL, (Node*) cclenter(rlxstr)); rtok = relex(); return (unary(np)); case NCCL: np = op2(NCCL, NIL, (Node *) cclenter(rlxstr)); rtok = relex(); return (unary(np)); case '^': rtok = relex(); return (unary(op2(CHAR, NIL, (Node *) HAT))); case '$': rtok = relex(); return (unary(op2(CHAR, NIL, NIL))); case '(': rtok = relex(); if (rtok == ')') { /* special pleading for () */ rtok = relex(); return unary(op2(CCL, NIL, (Node *) tostring(""))); } np = regexp(); if (rtok == ')') { rtok = relex(); return (unary(np)); } else error(MM_ERROR, badre, lastre, prestr); default: error(MM_ERROR, ":9:Illegal primary in regular expression %s at %s", lastre, prestr); } /*NOTREACHED*/ } Node *concat(np) Node *np; { switch (rtok) { case CHAR: case MCHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': return (concat(op2(CAT, np, primary()))); default: return (np); } } Node *alt(np) Node *np; { if (rtok == OR) { rtok = relex(); return (alt(op2(OR, np, concat(primary())))); } return (np); } Node *unary(np) Node *np; { switch (rtok) { case STAR: rtok = relex(); return (unary(op2(STAR, np, NIL))); case PLUS: rtok = relex(); return (unary(op2(PLUS, np, NIL))); case QUEST: rtok = relex(); return (unary(op2(QUEST, np, NIL))); default: return (np); } } relex() /* lexical analyzer for reparse */ { register int c; uchar cbuf[150]; int clen, cflag; switch (c = *prestr++) { case '|': return OR; case '*': return STAR; case '+': return PLUS; case '?': return QUEST; case '.': return DOT; case '\0': prestr--; return '\0'; case '^': case '$': case '(': case ')': return c; case '\\': if ((c = *prestr++) == 't') c = '\t'; else if (c == 'n') c = '\n'; else if (c == 'f') c = '\f'; else if (c == 'r') c = '\r'; else if (c == 'b') c = '\b'; else if (c == '\\') c = '\\'; else if (isdigit(c)) { int n = c - '0'; if (isdigit(*prestr)) { n = 8 * n + *prestr++ - '0'; if (isdigit(*prestr)) n = 8 * n + *prestr++ - '0'; } c = n; } /* else it's now in c */ rlxval = c; return CHAR; default: if((cflag = WIDTH(c)) > 1) { clen = 0; cbuf[clen++] = c; while(--cflag) cbuf[clen++] = *prestr++; cbuf[clen] = 0; rlxval = (int) tostring(cbuf); return MCHAR; } rlxval = c; return CHAR; case '[': clen = 0; if (*prestr == '^') { cflag = 1; prestr++; } else cflag = 0; for (;;) { if ((c = *prestr++) == '\\') { cbuf[clen++] = '\\'; if ((c = *prestr++) == '\0') error(MM_ERROR, nontermcharclas, lastre); cbuf[clen++] = c; } else if (c == ']') { cbuf[clen] = 0; rlxstr = tostring(cbuf); if (cflag == 0) return CCL; else return NCCL; } else if (c == '\n') { error(MM_ERROR, ":10:Newline in character class %s...", lastre); } else if (c == '\0') { error(MM_ERROR, nontermcharclas, lastre); } else cbuf[clen++] = c; } } } int cgoto(f, s, c, cp) fa *f; int s, c; /* Be careful here; we actually do pass ints * for c to denote metasyntactic characters */ uchar *cp; { register int i, j, k; register int *p, *q; for (i=0; i<=f->accept; i++) setvec[i] = 0; setcnt = 0; /* compute positions of gototab[s,c] into setvec */ p = f->posns[s]; for (i=1; i<=*p; i++) { if ((k = f->re[p[i]].ltype) != FINAL) { if (k == CHAR && c == f->re[p[i]].lval || k == DOT && c != 0 && c != HAT || k == ALL && c != 0 || k == MCHAR && !strncmp(cp, (uchar *) f->re[p[i]].lval, WIDTH(c)) || k == CCL && memberw(cp, (uchar *) f->re[p[i]].lval) || k == NCCL && !memberw(cp, (uchar *) f->re[p[i]].lval) && c != 0 && c != HAT ) { q = f->re[p[i]].lfollow; for (j=1; j<=*q; j++) { if (setvec[q[j]] == 0) { setcnt++; setvec[q[j]] = 1; } } } } } /* determine if setvec is a previous state */ tmpset[0] = setcnt; j = 1; for (i = f->accept; i >= 0; i--) if (setvec[i]) { tmpset[j++] = i; } /* tmpset == previous state? */ for (i=1; i<= f->curstat; i++) { p = f->posns[i]; if ((k = tmpset[0]) != p[0]) goto different; for (j = 1; j <= k; j++) if (tmpset[j] != p[j]) goto different; /* setvec is state i */ f->gototab[s][c] = i; return i; different:; } /* add tmpset to current set of states */ if (f->curstat >= NSTATES-1) { f->curstat = 2; f->reset = 1; for (i=2; iposns[i]); } else ++(f->curstat); for (i=0; igototab[f->curstat][i] = 0; xfree(f->posns[f->curstat]); if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) nospace("cgoto"); f->posns[f->curstat] = p; f->gototab[s][c] = f->curstat; for (i = 0; i <= setcnt; i++) p[i] = tmpset[i]; if (setvec[f->accept]) f->out[f->curstat] = 1; else f->out[f->curstat] = 0; return f->curstat; } freefa(f) struct fa *f; { register int i; if (f == NULL) return; for (i=0; i<=f->curstat; i++) xfree(f->posns[i]); for (i=0; i<=f->accept; i++) xfree(f->re[i].lfollow); xfree(f->restr); xfree(f); } #endif /* OLD_REGEX */