#include #include #include "re.h" /****************************************************************** * regexp parsing functions * * This section is used to parse the regexp and create a tree that is * then used to look for the match. To support other RE syntaxes, it * is simple to replace the parser with one written for the desired * syntax, as long as the form of the parse tree is the same. * * The tree is composed of nodes. Nodes hold repeat count, and type * information, and other data specific to the particular type. They * are linked in chains which must be traversed to complete the * match. There are currently 6 kinds of nodes: * * - STRING nodes: these are literal strings that must match (though * they also recognize special characters like '.') * * - MEMBER nodes: A character must match one of the members of the string * * - RANGE nodes: encode a range as used in OR nodes, ie [a-z] * * - AND nodes: forks the chain to insert a subchain that also needs * to be executed * * - OR nodes: similar to AND, forks a chain, but only one of the * subchain needs to match to succeed. * * - SUCCESS node: appended to the topmost chain, if it is reached * then the search has succeeded * * In addition, there are 2 modification bits that can be set for each node * type: * * - QUOTE bit: matches for this node should be returned in a result structure * * - OR_CHILD bit: The parent of this node is an OR, when it succeeds * it should not do linked nodes, but instead jump to the parent * * As an example, the tree for "abc*d\(efg\[hij\(lm\)\]?)+xyz" would * look like: * * "AND - SUCCESS * | * ab - c{0, -1} - d - AND{1, -1} - xyz * | * efg - OR{0, 1} * | * hij - AND * | * lm * * RE SYNTAX * * One of the advantages of this organization is it is easy to change * the syntax. This describes the syntax used here, by linking in a * different parser a different parser could be used. * * The syntax used here is more similar to standard (elisp or unix) * REs, but not identical - there are a few things I don't think are * done the best way which I changed, and some features are not * implemented. * * I tried to be more consistent in defining which characters are * escaped. The original thought was to escape all special characters, * and have all non-escaped characters match themselves. I still like * this, but compromised on the following: all iteration modifiers * ('*', '+', '?', '{') do not require escaping, everything else does. * This makes it easier to remember, only one type of character needs * escaping (and I considered replacing "*" with "\"*") * * The repeat instructions *, +, and ? function just like expected: * they apply to the unit they follow. In addition there is a counter * {a,b} which requires the number of matches to be >= a and <= b. If * b is not specified then it just requires there be >= a matches. * * I have made the [] construction into a more general OR than * standard. While "\[a-z12\]" will still match any small letter or the * digits 1 or 2, the RE "\[a-m\(no\)p\]" will match any character * from a to m, the string "no", or the character 'p'. However, if * '\~' is used to negate the OR only strings can be used. (it is easy * to provide the functionality, but it is not clear how to work it - * for instance, how many characters would match '\[~a-z\(123\)\]"?) * * Finally, in elisp the "\(\)" construction serves the 2 purposes of * grouping and recording. I have separated them by adding the '\"' * command, which records the following match in the results. The * results are represented in a nested list structure, with the top * level being the entire match. * * SPECIAL CHARACTERS * - \n : match any digit [0-9] * - \a : match any alphabetic [a-z] (or [A-Z] if case matching is turned off) * - \A : match any alphabetic [A-Z] (or [a-z] if case matching is turned off) * - \- : match any alphabetic [a-zA-Z] * - \_ : match any whitespace [ \t\n] * - \\ : match a single '\' char * - \. : match any character * - \~ : (at beginning of RE) anchor beginning * - \$ : (at end of RE) anchor end * - \g : (at beginning of RE) force greedy matching algorthm * - \G : (at beginning of RE) force lazy matching algorthm * - \C : require subsequent alphabetic matches have the same case * - \c : match alphabetic characters regardless of case * * * There are 3 functions that need to be provided by this file, or by * its replacement. They are: * - parseRE: returns a parse tree or NULL for error * - cleanupTree: recursively frees a parse tree * - matchChar: tells if the current RE location matches the current string location * * In addition, there should be an array of tests for testing the * searcher in case it is built standalone. * ******************************************************************/ // This assures that empty string match static struct node empty = {N_STRING | N_STATIC, 1, 1, NULL, {"", 0} #ifdef DEBUG , 0 #endif }; ////////////////////////////////////////////////// // // Node manipulation // ////////////////////////////////////////////////// // make a new node of type TYPE static struct node *getNewNode(VD_INT4 type) { struct node *node = malloc(sizeof(struct node)); node->type = (unsigned char) type; node->min = node->max = 1; node->link = NULL; nextID(&node->id, type); return node; } // Returns a new STRING node static struct node *makeDataNode(VD_UTF8 *re, VD_INT4 end, VD_INT4 string) { struct node *new = getNewNode((string) ? N_STRING : N_MEMBER); new->d.string.start = re; new->d.string.len = end; return new; } // returns a new LOGIC node static struct node *makeLogicNode(VD_UINT4 command) { struct node *new = getNewNode(command); new->d.logic.root = NULL; return new; } // returns a new RANGE node static struct node *makeRangeNode(VD_UCS4 low, VD_UCS4 high) { struct node *new = getNewNode(N_RANGE); //default, reset later if OR new->d.range.low = low; new->d.range.high = high; return new; } //////////////////////////////////////////////// // // Special functions for setting up OR nodes // //////////////////////////////////////////////// // Splice a RANGE node into a STRING node. Be careful about manipulating // the tree so that the iteration in andStringsToOrStrings continues static VD_INT4 replaceRange(struct node *node, VD_UCS4 low, VD_INT4 plow, VD_INT4 ptr) { struct node *n = NULL, *link = NULL; VD_INT4 len = node->d.string.len; VD_UTF8 *start = node->d.string.start; VD_UCS4 high; if (plow) { // if there are earlier characters leave them in the current node node->d.string.len = plow; link = node->link; if (!(node->link = makeDataNode(start + plow, len - plow, 0))) { return 0; } node = node->link; node->link = link; node->type |= N_ORCHILD; } // here there is a string node beginning with the range, // make it into a RANGE node high = VDI_nextChar(start, &ptr, len); // get high char node->type = N_RANGE | N_ORCHILD; node->d.range.low = low; node->d.range.high = high; // if there are leftover characters splice in a STRING node if (ptr < len) { link = node->link; node->link = makeDataNode(start + ptr, len - ptr, 1); node->link->link = n; node->link->type |= N_ORCHILD; } return 1; } // Initially AND strings and OR strings are parsed the same. This // subroutine converts plain strings into linked strings and ranges, // when the '-' character ie encountered. It also marks each node // either with the ORCHILD flag or the NOT flag static VD_INT4 andStringsToOrStrings(struct node *node, VD_INT4 not) { VD_UCS4 ch, low; VD_INT4 ptr, p0, plow; while (node) { node->type |= N_ORCHILD; if (node->type & N_STRING) { node->type = (node->type & ~N_STRING) | N_MEMBER; plow = ptr = 0; ch = VDI_nextChar(node->d.string.start, &ptr, node->d.string.len); // skip first char while (ptr < node->d.string.len) { low = ch; p0 = ptr; ch = VDI_nextChar(node->d.string.start, &ptr, node->d.string.len); if (ch == VD_RE_RANGE_CHAR) { if (!replaceRange(node, low, plow, ptr)) { return 0; } break; } plow = p0; } } else if ((node->type & N_LOGIC) && not) { return 0; } node = node->link; } return 1; } // Function used for \|. It wraps each side of the OR in an AND if // there is more than one element to be matched static struct node *makeOrChild(struct node *node) { struct node *and = NULL; if (node) { if (node->link) { // if there is more than 1 in the chain enclose it in an AND if (!(and = makeLogicNode(N_AND))) { return NULL; } and->d.logic.root = node; node = and; } node->type |= N_ORCHILD; } return node; } //////////////////////////////////////////////////// // // subroutines to set up repeat counts // //////////////////////////////////////////////////// // subroutine to handle multiplicity after strings: the count applies // only to the last character (or 2 chars if it is an escape sequence) static struct node *multAfterString(VD_UTF8 *re, VD_INT4 *pstart, VD_INT4 end, VD_INT4 prev, struct node *node) { if (*pstart < 0) { return node; } if (*pstart < prev) { node->link = makeDataNode(re + *pstart, prev - *pstart, 1); node = node->link; } if (prev < end) { node->link = makeDataNode(re + prev, end - prev, 1); node = node->link; } *pstart = -1; return node; } // common repeat code, used for * + ? {} struct node *doRepeat(struct node *node, VD_UTF8 *re, VD_INT4 len, VD_INT4 *pptr, VD_UINT4 min, VD_UINT4 max, VD_UINT4 *pquote) { VD_INT4 ptr = *pptr; node->min = min; node->max = max; node->type |= *pquote; *pquote = 0; if (VD_Unicode_UTF8GetIncr(re, &ptr, len) == VD_RE_LAZY_MODIFIER) { *pptr = ptr; node->type |= N_LAZY; } return node; } ///////////////////////////////////////// // // general parser help functions // ///////////////////////////////////////// // make a string node of any unprocessed characters after a special character is found static VD_INT4 terminateString(struct node **pnode, VD_UTF8 *re, VD_INT4 *pstart, VD_INT4 end, VD_UINT4 *pquote) { if (*pstart >= 0) { if (!((*pnode)->link = makeDataNode(re + *pstart, end - *pstart, 1))) { return 0; } *pnode = (*pnode)->link; (*pnode)->type |= *pquote; *pquote = 0; *pstart = -1; } return 1; } // help do the bookkeeping for the return from parseRE: pass back the current pointer // and clean up in case of error static struct node *doReturn(VD_INT4 failed, struct node *node) { if (failed) { VDI_cleanupTree(node); return NULL; } // I looked into simplifying some trees here, but I don't think the // savings justify the trouble. return (node->type & N_STATIC) ? node->link : node; } // main parsing routine: this returns a node with the head of the // parse chain for this level, or NULL if there is an error. It // returns the current pointer location in the variable PRE //RWY static struct node *parseREInternal (VD_UTF8 *re, VD_INT4 len, VD_INT4 *pptr, VD_UCS4 terminate) { VD_INT4 p, p0, pprev; VD_UTF8 *pch; struct node *n = NULL; struct node top = {N_STATIC, 1, 1, &empty}; struct node *node = ⊤ VD_INT4 start = -1; VD_UCS4 ch; VD_UINT4 min, max, command, quote = 0; while (1) { pprev = p0; p0 = *pptr; ch = VDI_nextChar(re, pptr, len); if (*pptr == p0) { break; } switch (ch) { case VD_RE_ESCAPE_CHAR: switch(ch = VDI_nextChar(re, pptr, len)) { case VD_RE_OR_CHAR: if (!terminateString(&node, re, &start, p0, "e) || !(node = makeLogicNode(N_OR)) || !(node->d.logic.root = makeOrChild(top.link)) || !(node->d.logic.root->link = makeOrChild(parseREInternal(re, len, pptr, terminate)))) { return doReturn(1, &top); } top.link = node; return doReturn(0, &top); case VD_RE_OPEN_OR_CHAR: command = N_OR; p = *pptr; if (VD_RE_NOT_OR_CHAR == VDI_nextChar(re, &p, len)) { *pptr = p; command = N_NOTOR; } if (!terminateString(&node, re, &start, p0, "e) // if it is NOT then change this node into an AND // note this has the result that each of the nodes is pushed on the stack || !(node = node->link = makeLogicNode(command)) || !(node->d.logic.root = parseREInternal(re, len, pptr, VD_RE_CLOSE_OR_CHAR)) || !andStringsToOrStrings(node->d.logic.root, command == N_NOTOR)) { return doReturn(1, &top); } node->type |= quote; quote = 0; break; case VD_RE_OPEN_AND_CHAR: if (!terminateString(&node, re, &start, p0, "e) || !(node->link = makeLogicNode(N_AND))) { return doReturn(1, &top); } node = node->link; node->type |= quote; quote = 0; if (!(node->d.logic.root = parseREInternal(re, len, pptr, VD_RE_CLOSE_AND_CHAR))) { return doReturn(1, &top); } break; case VD_RE_QUOTE_CHAR: if (!terminateString(&node, re, &start, p0, "e)) { return doReturn(1, &top); } quote = N_QUOTE; break; case VD_RE_CLOSE_OR_CHAR: case VD_RE_CLOSE_AND_CHAR: // case 0: // leftover from asciz strings terminateString(&node, re, &start, p0, "e); return doReturn((terminate != ch), &top); default: // keep track of non-special characters to match if (start < 0) { start = p0; } } break; case VD_RE_STAR_REPEAT_CHAR: node = multAfterString(re, &start, p0, pprev, node); node = doRepeat(node, re, len, pptr, 0, -1, "e); break; case VD_RE_PLUS_REPEAT_CHAR: node = multAfterString(re, &start, p0, pprev, node); node = doRepeat(node, re, len, pptr, 1, -1, "e); break; case VD_RE_QMARK_REPEAT_CHAR: node = multAfterString(re, &start, p0, pprev, node); node = doRepeat(node, re, len, pptr, 0, 1, "e); break; case VD_RE_OPEN_REPEAT_CHAR: node = multAfterString(re, &start, p0, pprev, node); max = -1; min = (unsigned short) strtol(re + *pptr, (char **) &pch, 0); *pptr += (pch - re - *pptr); ch = VDI_nextChar(re, pptr, -1); if (',' == ch) { max = (unsigned short) strtol(re + *pptr, (char **) &pch, 0); *pptr += (pch - re - *pptr); ch = VDI_nextChar(re, pptr, -1); } if ((VD_RE_CLOSE_REPEAT_CHAR != ch) || (min > max)) { return doReturn(1, &top); } node = doRepeat(node, re, len, pptr, min, max, "e); break; default: if (start < 0) { start = p0; } } } if (start >= 0) { node->link = makeDataNode(re + start, *pptr - start, 1); node->link->type |= quote; } return doReturn(terminate, &top); } // This creates the parse tree from the input RE. The tree always // starts with an AND node that contains the tree contents, and this // AND node is followed by a SUCCESS node: // // "AND - SUCCESS // | // (re tree) ... // . // . // . // // The SUCCESS node assures the search terminates at the right time // and the "AND makes sure the overall results are reported struct node *VDI_parseRE(VD_UTF8 *str, VD_INT4 len, VD_UINT4 debug) { struct node *root = makeLogicNode(N_AND); VD_INT4 ptr = 0; root->type |= N_QUOTE; if (!(root->d.logic.root = parseREInternal(str, len, &ptr, 0))) { printf("error parsing regexp '%s' around position %d\n", str, ptr); return NULL; } root->link = getNewNode(N_SUCCESS); DODEBUG(debug, dbTree(root)); return root; } // return all memory allocated for the regexp tree void VDI_cleanupTree(struct node *node) { if (node) { if (node->link) { VDI_cleanupTree(node->link); } if (node->type & N_LOGIC) { VDI_cleanupTree(node->d.logic.root); } if (!(node->type & N_STATIC)) { free(node); } } } /****************************************************************** * * Functions for matching characters * * Originally this was in the main file, but string matching is * related to parsing. The idea is, special characters can be * redefined by the syntax, but need to be evaluated in the execution. * One way to do it would be to make special node types for special * characters and break up the STRING nodes into chains of regular * characters separated by special characters. That is ugly, and could * cause other problems with things like repeat counts. This is a * compromise solution: it is simple and flexible, but it does move * some of the traversal execution into the parser module. * * The RE is always updated to reflect its length. The return value is * the number of bytes to advance the string pointer if it succeeds, * or negative to indicate no match. * ******************************************************************/ // RWY TODO: this needs to use the unicode database VD_INT4 VDI_matchChar(VD_UTF8 **re, VD_UTF8 *str, VD_INT4 strlen, VD_UINT4 *penv) { VD_INT4 pre = 0, pstr = 0; VD_INT4 cre = VDI_nextChar(*re, &pre, -1); VD_INT4 cstr = VDI_nextChar(str, &pstr, strlen); VD_INT4 ret = 0; if (cre == VD_RE_ESCAPE_CHAR) { cre = VDI_nextChar(*re, &pre, -1); switch (cre) { case 'n': ret = (('0' <= cstr) && (cstr <= '9')); break; case 'a': ret = (('a' <= cstr) && (cstr <= 'z')) || ((*penv & E_NOMATCHCASE) && ('A' <= cstr) && (cstr <= 'Z')); break; case 'A': ret = (('A' <= cstr) && (cstr <= 'Z')) || ((*penv & E_NOMATCHCASE) && ('a' <= cstr) && (cstr <= 'z')); break; case '-': ret = (('A' <= cstr) && (cstr <= 'Z')) || (('a' <= cstr) && (cstr <= 'z')); break; case 'C': *penv &= ~E_NOMATCHCASE; ret = 1; pstr = 0; break; case 'c': *penv |= E_NOMATCHCASE; ret = 1; pstr = 0; break; case '_': ret = ((cstr == ' ') || (cstr == '\t') || (cstr == '\n')); break; case '\\': ret = (cstr == '\\'); break; case '.': ret = (cstr != 0); break; case '~': ret = 0; break; // processed elsewhere case '$': ret = (strlen == 0); break; default: ret = (cstr == cre); } } else { ret = (cstr == cre); if (!ret && (*penv & E_NOMATCHCASE)) { cstr -= cre; ret = ((cstr == 'A' - 'a') || (cstr == 'a' - 'A')); } } *re += pre; return (ret) ? pstr : -1; } #ifdef STANDALONE // tests are run in the main file but defined in parser, since // different parsers would use different test strings // // REGEXP, DATA, EXPECTED_RESULT // The expected result simulates nested lists by using <> to surround // sublists and , to separate elements of a list. If the test is not // supposed to find a match use NULL for the expected result char *tests[][3] = { {"a\\(bc\\)*d", "ad", "ad"}, {"a\\(bc\\)*d", "abcd", "abcd"}, {"a\\(bc\\)*d", "abcbcd", "abcbcd"}, {"a\\(bc\\)*", "a", "a"}, {"a\\(bc\\)*", "abc", "abc"}, {"a\\(bc\\)*", "abcbc", "abcbc"}, {"a\\\"\\(bc\\)*bcd", "abcbcbcd", "abcbcbcd"}, {"a\\(bc\\)+d", "ad", NULL}, {"a\\(bc\\)+d", "abcd", "abcd"}, {"a\\(bc\\)+d", "abcbcd", "abcbcd"}, {"a\\(bc\\)+", "a", NULL}, {"a\\(bc\\)+", "abc", "abc"}, {"a\\(bc\\)+", "abcbc", "abcbc"}, {"a\\\"\\(bc\\)+bcd", "abcbcbcd", "abcbcbcd"}, {"a\\(bc\\)?d", "ad", "ad"}, {"a\\(bc\\)?d", "abcd", "abcd"}, {"a\\(bc\\)?d", "abcbcd", NULL}, {"a\\(bc\\)?", "a", "a"}, {"a\\(bc\\)?", "abc", "abc"}, {"a\\(bc\\)?", "abcbc", "abc"}, {"a\\\"\\(bc\\)?bcd", "abcbcbcd", NULL}, {"\\(bc\\)*", "ssss", ""}, {"\\(bc\\)*", "bc", "bc"}, {"\\(bc\\)*", "bcbc", "bcbc"}, {"a\\(bc\\)?d", "ad", "ad"}, {"a\\(bc\\)?d", "abcd", "abcd"}, {"a\\(bc\\)?d", "abcbcd", NULL}, {"a\\(bc\\){3,4}d", "abcbcd", NULL}, {"a\\(bc\\){3,4}d", "abcbcbcd", "abcbcbcd"}, {"a\\(bc\\){3,4}d", "abcbcbcbcd", "abcbcbcbcd"}, {"a\\(bc\\){3,4}", "abcbcbcbcd", "abcbcbcbc"}, {"a\\(bc\\){3,4}d", "abcbcbcbcbcd", NULL}, {"a\\(bc\\)+d", "ad", NULL}, {"a\\(bc\\)+d", "abcd", "abcd"}, {"a\\(bc\\)+d", "abcbcd", "abcbcd"}, {"a*b", "b", "b"}, {"a+b", "b", NULL}, {"\\\"a*", "aaaaab", "aaaaa"}, {"a*", "aaaaa", "aaaaa"}, {"\\(ab\\)*abc", "ababababcx", "ababababc"}, {"\\\"\\(ab\\)*abc", "ababababcx", "ababababc"}, {"\\\"\\(a\\(bc\\)*\\)*z", "aaabcbcaz", "aaabcbcaz"}, {"\\(a\\(b\\(c\\)*\\)*\\)*", "abbcccbcaa", "abbcccbcaa"}, {"\\[ab\\]", "b", "b"}, {"\\[ab\\]", "a", "a"}, {"\\[ab\\]", "c", NULL}, {"x\\[ab\\]*", "x", "x"}, {"x\\[ab\\]+", "x", NULL}, {"a\\\"\\[bcd\\(ef\\)\\]*de", "abefefde", "abefefde"}, // following is infinite loop - acceptable? // {"\\(a*\\)*", "aaaa", "aaaa"}, // bug: make match of empty subexpression terminate {"bc*\\|x+y", "bcccxxxybxydlk", "bccc"}, {"a\\(bc*\\|x+y\\)+z", "abcccxxxybxyzzz", "abcccxxxybxyz"}, {"\\(bc*\\|x+y\\)*", "bcccxxxybxydlk", "bcccxxxybxy"}, {"a\\n*b", "a123b", "a123b"}, {"a\\n*b", "a12x3b", NULL}, {"\\\"\\[\\(ab\\)\\(cd\\)+\\]+ab", "cdcdababx", "cdcdabab"}, {"a\\\"\\.*z", "amnopqzzxy", "amnopqzz"}, {"a\\\"\\(\\.*\\)z", "amnopqzzxyzxx", "amnopqzzxyz"}, {"a\\[abc\\]*", "acbABC", "acb"}, {"a\\c\\[abc\\]*", "acbABC", "acbABC"}, {"\\[~abc\\]*", "wxyzabc", "wxyz"}, {"\\[~ab-de\\]*", "wxyzc", "wxyz"}, {"\\[~ab-de\\]*", "wxyz", "wxyz"}, {"\\\"\\[~ab-de\\]*z", "wxyz", "wxyz"}, {"\\[~\\na\\]*", "wxyzabc", "wxyz"}, {"\\[~\\na\\]*", "wxy5zabc", "wxy"}, {"\\[a-c\\]+", "ab-cbe", "ab"}, {"abc", "", NULL}, {"a*", "", ""}, {"", "a", ""}, {"\\.*", "abcd", "abcd"}, {"\\[-ac\\]+", "ac-cbe", "ac-c"}, {"a\\(\\){3,4}b", "abc", "ab"}, {"\\~abcd", "abcde", "abcd"}, {"abcd\\$", "abcd", "abcd"}, {"abcd\\$", "abcde", NULL}, {"\\[\\62-\\67\\]*x\\0104\\0x45z", "ABCxDEz", "ABCxDEz"}, {"\\[a-ce-gik-m\\]+", "abcefgiklmd", "abcefgiklm"}, {"a\\\"\\(bc\\)+bc", "abcbcbcbcbc", "abcbcbcbcbc"}, {"a\\\"\\(bc\\)+?bc", "abcbcbcbcbc", "abcbc"}, {"\\\"a*?a*?b", "aaaaaab", "aaaaaab"}, {"\\\"a*a*?b", "aaaaaab", "aaaaaab"}, {"\\\"\\[\\(ab\\)+?cd\\]*", "cdababc", "cdababc"}, {"\\\"\\[\\(ab\\)+cd\\]*", "cdababc", "cdababc"}, {"ab\\\"\\(cd\\|ef*\\)+", "abeecdcdefffg", "abeecdcdefff"}, {"\\\\|\"])[(\\}\\{\\*\\+\\?\\z", "\\|\"])[(}{*+?z", "\\|\"])[(}{*+?z"}, {NULL, NULL, NULL} }; #endif