#include #include #include "re.h" /****************************************************************** * OUTLINE * * This design first parses the search specification to come up with a * parse tree, and then steps along that until it either finishes or * fails. This has a few good results: first, changes in the RE syntax * are isolated in the parser, and so are relatively easy to make * without touching the trickier parts of the program. * * Second, stepping along the tree is a natural way of doing this. For * example, in an earlier program I had a lot of trouble recognizing * when the search was complete. In the new way, by attaching a * SUCCESS node to the end of the tree, when the SUCCESS node is * reached the search is finished. * * So, the following program is in 3 parts. These are the RE parser, * the traverser, and the reporter. The parser parses the RE to get a * tree, the traverser walks the tree, leaving a stack trail * describing where it goes, and the reporter then deciphers that * trail to provide the RE match. More detail is provided in the * headers for each section. * * ******************************************************************/ /****************************************************************** * TODO 9/24 * - better error handling * - program failure, ie memory allocation * *****************************************************************/ /****************************************************************** * Traversal * * This section takes a parsed RE tree and tries to match it to the * string. It works by simultaneously keeping track of 2 structures: * its location in the RE parse tree, and the frame stack that keeps * track of the state. (at first I thought to do this with recursive * function calls, but there were 2 problems: first, I worried about * the overhead and size in evaluating complex regexps, and second at * times it is necessary to access data from lower down in the call * stack, so even if it had been implemented that way I would have * needed some sort of parallel stack) * * When a node is evaluated it is pushed onto the stack. If it * succeeds another repetition is attempted if it is not already at * the maximum. If that fails the next chained node is attempted, and * if that fails the node is backed off. * * In the failure mode processing the topmost node is examined to see * if it can be altered - for instance, if it is an OR it can try a * later alternative, or a repeated node can be backed off. Once a new * possibility is found forward processing resumes. * * Eventually (unless of a programming error) processing in the RE * tree will either return to the initial node, in which case the * search failed, or reach the end of the tree (the SUCCESS node) in * which case it succeeded. * ******************************************************************/ /** Build command: To build with the debugging aids, make sure DEBUG is defined in re.h and type > gcc -o re1 re1.c re-debug.c re-parse.c unicode/*.c With no debugging needed re-debug.c is not needed: > gcc -o re1 re1.c re-parse.c unicode/*.c **/ //////////////////////////////////////////////////// // // State frame handling functions // //////////////////////////////////////////////////// // stack->ptr points to the NEXT frame, the current one is at ptr - 1 VD_INT4 VDI_topFrame(struct stateStack *stack) {return stack->ptr - 1;} static VD_INT4 popFrame(struct stateStack *stack) { stack->ptr--; return VDI_topFrame(stack); } struct state *VDI_getState(struct stateStack *stack, VD_INT4 which) { if (which < 0) { which = stack->ptr + which; } return stack->states + which; } // Initializes a new frame and pushes it on the stack, growing the // stack size if necessary // Return stack memory static void cleanupStack(struct stateStack *stack) { free(stack->states); free(stack); } // Gets the next char, but also interprets \xxx escape sequences VD_UCS4 VDI_nextChar(const VD_UTF8 *str, VD_INT4 *poffset, VD_INT4 length) { VD_UCS4 ch, ch1; VD_UTF8 *p; ch = VD_Unicode_UTF8GetIncr(str, poffset, length); if (ch == VD_RE_ESCAPE_CHAR) { ch1 = strtol(str + *poffset, (char **) &p, 0); if (str + *poffset < p) { *poffset += p - str - *poffset; ch = ch1; } } return ch; } // This does a little more than just pushing a state, it also // cherry-picks the easy pushes, to save time and trouble later on static VD_INT4 pushNewStates(struct stateStack *stack, struct node *node, VD_UINT4 count, VD_UTF8 *ptr, VD_INT4 parent) { struct state *state = NULL; VD_INT4 ret = 0; while (node) { DODEBUG(stack->env & E_STATS, stack->pushed++); if (stack->ptr >= stack->allocated - 1) { if (stack->allocated >= MAXSTACKDEPTH) { printf("Exceeded maximum stack depth %d\n", stack->allocated); exit(1); } stack->allocated += ALLOCUNIT; stack->states = realloc(stack->states, stack->allocated*sizeof(struct state)); } state = stack->states + stack->ptr; state->node = node; state->ptr = ptr; state->parent = parent; if (!(node->type & N_LOGIC) && !count && node->min) { count = 1; // leave off 0 reps for these } state->count = count; if (node->type & N_LAZY) { state->nextAction = (count < node->min) ? A_REPEAT_LAZY : A_CHAIN_LAZY; } else { state->nextAction = (count < node->max) ? A_REPEAT_GREEDY : A_CHAIN_GREEDY; } #ifdef DEBUG if (stack->ptr >= stack->maxSize) { stack->maxSize = stack->ptr + 1; } #endif ret = stack->ptr++; // cherry pick the "no brainer" repeats count = 0; if (node->type & N_LOGIC) { if (state->nextAction & A_REPEAT) { // if it is a repeat then push its first child node = node->d.logic.root; parent = ret; } else if (node->link && !(node->type & N_ORCHILD)) { // else if it is a link and no other processing is needed, push the link state->nextAction >>= 1; node = node->link; } else break; } else break; } return ret; } ///////////////////////////////////////////////////// // // checking functions: // check if a node matches the string or not // ///////////////////////////////////////////////////// // Checks if a STRING matches the current pointer location. // It takes into account special characters static VD_UTF8 *matchString(VD_UTF8 *str, VD_INT4 strlen, VD_UTF8 *re, VD_INT4 relen, VD_UINT4 *penv) { VD_UTF8 *end = re + relen; VD_INT4 result; while (re < end) { result = VDI_matchChar(&re, str, strlen, penv); if (result < 0) { return NULL; } str += result; strlen -= result; } return str; } // checks if a character is a member of a string (for OR) static VD_UTF8 *matchOrString(VD_UTF8 *str, VD_INT4 strlen, VD_UTF8 *re, VD_INT4 relen, VD_UINT4 *penv) { VD_UTF8 *end = re + relen; VD_INT4 result; VD_INT4 i = 0; while (re < end) { result = VDI_matchChar(&re, str, strlen, penv); if (result > 0) { VDI_nextChar(str, &i, strlen); return str + i; } } return NULL; } static VD_UTF8 *matchNotOrNode(struct node *node, VD_UTF8 *str, VD_INT4 strlen, VD_UINT4 *penv) { VD_INT4 p = 0; VD_UINT4 ch = VDI_nextChar(str, &p, strlen); if (!p) { return NULL; } while (node) { if (((node->type & N_MEMBER) && matchOrString(str, strlen, node->d.string.start, node->d.string.len, penv)) || ((node->type & N_RANGE) && ((node->d.range.low <= ch) && (ch <= node->d.range.high)))) { return NULL; } node = node->link; } return str + p; } // This does the special processing for each node type. For logic types it // does nothing, for string types it does the needed compare static VD_UTF8 *doChecks(struct stateStack *stack, struct state *state, VD_UTF8 *ptr) { struct node *node = NULL; VD_INT4 p = 0; VD_UINT4 ch; VD_INT4 strlen = stack->strEnd - ptr; if (state->count) { // do not test the 0th entry node = state->node; switch (node->type & (N_STRING | N_MEMBER | N_RANGE | N_NOTOR)) { case N_STRING: ptr = matchString(ptr, strlen, node->d.string.start, node->d.string.len, &stack->env); break; case N_MEMBER: ptr = matchOrString(ptr, strlen, node->d.string.start, node->d.string.len, &stack->env); break; case N_RANGE: ch = VDI_nextChar(ptr, &p, strlen); ptr = ((node->d.range.low <= ch) && (ch <= node->d.range.high)) ? ptr + p : NULL; break; case N_NOTOR: ptr = matchNotOrNode(node->d.logic.root, ptr, strlen, &stack->env); break; case 0: break; default: printf("Very unexpected node value of 0X%x\n", node->type); } } // else if (!strlen) { // ptr = NULL; // } state->ptr = ptr; return ptr; } //////////////////////////////////////////////////// // // Main control functions: // // Manipulate the stack and walk along the parse tree // until the string is matched or the stack is empty // //////////////////////////////////////////////////// // When the search process has succeeded to a point, this routine // finds the next node to push on the stack. // // When the search is finished it will navigate its way up to the SUCCESS node static VD_INT4 getNext(struct stateStack *stack, VD_UTF8 *ptr, VD_INT4 current) { struct state *state = NULL; while (1) { state = VDI_getState(stack, current); if (state->nextAction & A_REPEAT) { return pushNewStates(stack, state->node, state->count + 1, ptr, state->parent); } if (state->node->type & N_ORCHILD) { // this gets done below; the conditions could be combined, but // this way the code is clearer // current = state->parent; } else if (state->node->link) { return pushNewStates(stack, state->node->link, 0, ptr, state->parent); } current = state->parent; } } // When a tested node has failed frames must be popped off until // one is reached that has an option: an OR with more possibilities, // or a repeat count that can be removed. // Logic: // // 1) Topmost node has failed, pop it off and look at next node // 2) IF current node is an OR && the failed node is one of its alternatives // && there are more alternatives THEN push the next alternative and return // 3) IF the node is not in FAILED state AND its count is >= the minimum THEN shift to the // next state and get the next node static VD_INT4 backoff(struct stateStack *stack) { VD_INT4 current = VDI_topFrame(stack); struct state *followingState = NULL, *state = VDI_getState(stack, current); DODEBUG(stack->env & E_SHOWDEBUG, printf("FAIL\n")); DODEBUG(stack->env & E_STATS, stack->failed++); while (stack->ptr > 1) { followingState = state; current = popFrame(stack); state = VDI_getState(stack, current); if ((state->node->type & N_OR) // this is an OR... && (followingState->parent == current) // AND the following node is its child... && (followingState->node->link)) { // AND there are more possibilities return pushNewStates(stack, followingState->node->link, 0, state->ptr, current); } if (state->nextAction & A_NOTFAIL) { // hasn't failed if (state->count >= state->node->min) { // achieved minimum count state->nextAction >>= 1; // fallback to next action return getNext(stack, state->ptr, current); // get next } } } return -1; } // Does the actual work: walks along the tree and moves the frame stack // until the RE is either matched or fails // return 0 if the call is interrupted, 1 if it has completed static VD_INT4 traverse(struct stateStack *stack) { struct state *state = VDI_getState(stack, -1); VD_UTF8 *ptr = state->ptr; do { ptr = doChecks(stack, state, state->ptr); (ptr) ? getNext(stack, ptr, VDI_topFrame(stack)) : backoff(stack); DODEBUG(stack->env & E_STACK, dbStack(stack)); state = VDI_getState(stack, -1); if (stack->iterations++ >= VD_RE_BREAK_ITERATIONS) { return 0; } } while ((stack->ptr > 1) && (state->node->type != N_SUCCESS)); return 1; } /***************************************************************** * * RUNNING : Following are the control functions for running the * searcher, including both entry points for calling it and, if * compiled in, the MAIN entry point and test functions for * debugging and testing * ******************************************************************/ // Some special instructions can be given at the beginning of the RE. // For now these include anchoring the beginning of the match and // turning on or off greedy matching. // Does this belong in PARSE? static VD_INT4 checkInitialChar(VD_UTF8 *re, VD_INT4 *penv) { VD_INT4 len = 0, chlen = 0; while (VDI_nextChar(re, &chlen, -1) == VD_RE_ESCAPE_CHAR) { switch (VDI_nextChar(re, &chlen, -1)) { case '~': *penv |= E_LOOKINGAT; break; default: return len; } len += chlen; re += chlen; chlen = 0; } return len; } // initiates the search after the stack is set up elsewhere, and also // resumes it after interrupting for the kernel VD_INT4 VD_RESearchContinue(struct stateStack *stack) { VD_INT4 offset; stack->iterations = 0; do { if (!traverse(stack)) { DODEBUG(stack->env & E_SHOWDEBUG, printf("Interrupting search\n")); return VD_INTERRUPTED_CALL; } if ((stack->ptr > 1) || (stack->env & E_LOOKINGAT)) { break; } // match failed, reset the stack for the next try stack->ptr = 1; stack->states[0].nextAction = A_REPEAT_GREEDY; offset = 0; VDI_nextChar(stack->states[0].ptr, &offset, -1); stack->states[0].ptr += offset; // try next start char } while (stack->states[0].ptr < stack->strEnd); DODEBUG(stack->env & E_STATS, dbSummary(stack)); return VD_COMPLETED_CALL; } // Initial entry point for the RE function, it sets up the frame stack to the // initial state and then calls VD_RESearchContinue to do the work VD_INT4 VD_RESearch(struct stateStack *stack, VD_UTF8 *re, VD_INT4 relen, VD_UTF8 *str, VD_INT4 strlen, VD_UINT4 env) { struct node *root = NULL; VD_INT4 len; DODEBUG(env & E_SHOWDEBUG, printf("RE is '%s', length is %d\nstring is '%s', length is %d\n", re, relen, str, strlen)); len = checkInitialChar(re, &env); re += len; relen -= len; stack->states = NULL; stack->ptr = stack->allocated = 0; stack->env = env; stack->strEnd = str + strlen; root = VDI_parseRE(re, relen, env & E_TREE); if (!root) { printf("Failed to parse RE '%s'\n", re); return VD_ERROR_RESULT; } // initialize stack for call DODEBUG(env & E_STATS, stack->failed = stack->pushed = stack->maxSize = 0); pushNewStates(stack, root, 0, str, -1); return VD_RESearchContinue(stack); } /****************************************************************** * * Reporting: these functions build the structure that reports the * results. It is a nested list of all quoted subREs. * ******************************************************************/ // gets a node for the report structure // breaks silently if memory allocation fails static struct VD_RE_Result *getResult(VD_UTF8 *start, VD_INT4 len, struct VD_RE_Result *contains) { struct VD_RE_Result *r = NULL; if (r = malloc(sizeof(struct VD_RE_Result))) { r->start = start; r->len = len; r->contains = contains; r->chain = NULL; } return r; } // returns memory from a RESULT structure void VD_FreeREResults(struct VD_RE_Result *results) { if (results) { VD_FreeREResults(results->contains); VD_FreeREResults(results->chain); free(results); } } // Builds the RESULT structure after a successful search. This is // a nested list with the outer list being the entire search, and // sublists corresponding to nodes marked with \" static struct VD_RE_Result *results(struct stateStack *stack, struct node *terminate) { struct state *state = NULL, *end = NULL; struct VD_RE_Result hold, *r0 = &hold, *r = NULL; VD_UTF8 *p = NULL; state = VDI_getState(stack, -1); hold.chain = NULL; while ((state->node != terminate) && ((state + 1)->node->type != N_SUCCESS)) { stack->ptr++; r = NULL; if (state->node->type & N_QUOTE) { if (state->node->type & N_LOGIC) { if ((state + 1)->parent == stack->ptr - 2) { p = state->ptr; r0->chain = getResult(state->ptr, 0, results(stack, state->node)); r0->chain->len = VDI_getState(stack, -1)->ptr - p; r0 = r0->chain; } } else if (state->count) { p = (state - 1)->ptr; r0->chain = getResult(p, state->ptr - p, NULL); r0 = r0->chain; } } state = VDI_getState(stack, -1); } return hold.chain; } static void freeREStack(struct stateStack *stack) { if (stack->states) { VDI_cleanupTree(stack->states[0].node); free(stack->states); } } struct VD_RE_Result *VD_RESearchResults(struct stateStack *stack) { struct VD_RE_Result *result = NULL; if (stack->ptr > 1) { DODEBUG(stack->env & (E_FINALSTACK | E_STACK), dbStack(stack)); stack->ptr = 1; result = results(stack, NULL); DODEBUG(stack->env & E_RESULTS, dbResults(result, 2)); } freeREStack(stack); return result; } #ifdef STANDALONE // tests have been moved to re-parse.c because they are associated // with a particular syntax extern char *tests[][3]; // Compares the string from a single RESULT node with the expected result VD_UTF8 *compareOneResult(VD_UTF8 *p, VD_INT4 len, VD_UTF8 *expected) { VD_INT4 i; for (i = 0; i < len; i++) { if (*(p + i) != *expected++) { return NULL; } } return expected; } // Compares an entire RESULT tree from a successful search with the // expected result VD_UTF8 *compareResults(struct VD_RE_Result *results, VD_UTF8 *expected) { if (!results || !expected) { return NULL; } while (results) { if (!(expected = compareOneResult(results->start, results->len, expected))) { return NULL; } if (results->contains && ((*expected++ != '<') || !(expected = compareResults(results->contains, expected)) || (*expected++ != '>'))) { return NULL; } results = results->chain; if (results && (*expected++ != ',')) { return NULL; } } return expected; } // Runs all the self-tests VD_INT4 runTests(VD_UINT4 env) { struct VD_RE_Result *results = NULL; struct stateStack stack; VD_INT4 i, failed = 0; VD_UTF8 *r = NULL; VD_UTF8 *x; env |= E_LOOKINGAT; for (i = 0; tests[i][0]; i++) { printf("%4d %20s %20s %30s...", i, tests[i][0], tests[i][1], (tests[i][2]) ? (char *) tests[i][2]: "NO MATCH"); fflush(stdout); if (VD_RESearch(&stack, tests[i][0], strlen(tests[i][0]), tests[i][1], strlen(tests[i][1]), env) == VD_INTERRUPTED_CALL) { while (VD_RESearchContinue(&stack) == VD_INTERRUPTED_CALL); } results = VD_RESearchResults(&stack); r = "passed"; if (results) { if (!(x = compareResults(results, tests[i][2])) || (*x)) { r = "failed"; // dbResults(results, 5); failed++; } VD_FreeREResults(results); } else if (tests[i][2]) { r = "failed"; failed++; } printf(" %s\n", r); } printf("Ran %d tests, %d succeeded, %d failed\n", i, i - failed, failed); return (failed != 0); } // Prints out usage message and exits void help(VD_UTF8 *name) { printf("%s [-SWITCHES...] REGEXP STRING\n", name); printf("\t-x: run tests (add -l to run lazy tests)\n"); printf("\t-a: match must start at the first character ('looking at')\n"); printf("\t-A: search entire string for match (default)\n"); printf("\t-c: ignore case in checking for match\n"); printf("\t-C: compare case in checking for match (default)\n"); #ifdef DEBUG printf("\t-f: print out frames after successful search\n"); printf("\t-F: print out frames after each iteration\n"); printf("\t-t: print out the parse tree\n"); printf("\t-r: print out results structure\n"); printf("\t-s: print out statistics on finish\n"); printf("\t-d: print out various debugging information\n"); printf("\t-z: equivalent to -atrsdf\n"); #endif exit(1); } VD_INT4 getArgs(VD_INT4 argc, char **argv, VD_INT4 *penv) { VD_INT4 i, j, ret = 0; *penv = 0; for (i = 1; (i < argc) && (*argv[i] == '-'); i++) { for (j = 1; argv[i][j]; j++) { switch (argv[i][j]) { case 'x': *penv |= E_RUNTESTS; ret = 1; break; case 'a': *penv |= E_LOOKINGAT; break; case 'A': *penv &= ~E_LOOKINGAT; break; case 'C': *penv &= ~E_NOMATCHCASE; break; case 'c': *penv |= E_NOMATCHCASE; break; #ifdef DEBUG case 'f': *penv |= E_FINALSTACK; break; case 'F': *penv |= E_STACK; break; case 't': *penv |= E_TREE; break; case 'r': *penv |= E_RESULTS; break; case 's': *penv |= E_STATS; break; case 'd': *penv |= E_SHOWDEBUG; break; case 'z': *penv |= (E_TREE | E_RESULTS | E_SHOWDEBUG | E_STATS | E_LOOKINGAT | E_FINALSTACK); break; #endif default: return -1; } } } if (ret) { return ret; } if (argc == i) { *penv |= E_INTERACTIVE; return 0; } return (argc - i == 2) ? i : -1; } char *stripEOL(char *p) { char *p1 = p; if (p) { while (*p++); *(p - 2) = 0; } return p1; } VD_INT4 interactive(VD_UINT4 env) { char regexp[RE_SIZE + 1]; char string[STRING_SIZE + 1]; struct VD_RE_Result *results = NULL; struct stateStack stack; do { printf("Enter regexp: "); fflush(stdout); stripEOL(fgets(regexp, RE_SIZE, stdin)); if (regexp[0]) { printf("Enter string: "); fflush(stdout); stripEOL(fgets(string, STRING_SIZE, stdin)); if (VD_RESearch(&stack, regexp, strlen(regexp), string, strlen(string), env) == VD_INTERRUPTED_CALL) { while (VD_RESearchContinue(&stack) == VD_INTERRUPTED_CALL); } results = VD_RESearchResults(&stack); if (!results) { printf("Search failed\n"); } else { printf("SEARCH SUCCEEDED, %.*s\n", results->len, results->start); VD_FreeREResults(results); } } } while (regexp[0]); return 0; } int main(VD_INT4 argc, char **argv) { struct stateStack stack; struct VD_RE_Result *results = NULL; VD_INT4 i = 0; VD_UINT4 env = 0; VD_UTF8 *re = NULL, *str = NULL; if ((i = getArgs(argc, argv, &env)) < 0) { help(argv[0]); } if (env & E_RUNTESTS) { i = runTests(env); } else if (env & E_INTERACTIVE) { i = interactive(env); } else { re = argv[i++]; str = argv[i]; // Below here is the template code for vd to use if ((i = VD_RESearch(&stack, re, strlen(re), str, strlen(str), env)) == VD_INTERRUPTED_CALL) { while ((i = VD_RESearchContinue(&stack)) == VD_INTERRUPTED_CALL); } results = VD_RESearchResults(&stack); // above here is the template code for vd to use if (!results) { printf("Search failed\n"); i = 1; } else { printf("SEARCH SUCCEEDED, %.*s\n", results->len, results->start); VD_FreeREResults(results); i = 0; } } return i; } #endif