///////////////////////////////////////////////////// // // DEBUGGING STUFF // // if DEBUG and/or STANDALONE are defined, these set up the // program conditions // /////////////////////////////////////////////////////// #define DEBUG 1 // include and call debugging output functions #define STANDALONE 1 // include the MAIN function // used for interactive calls #define RE_SIZE 100 // maximum size of RE #define STRING_SIZE 150 // maximum size of string /////////////////////////////////////////////////////////// // // EXECUTION PARAMETERS // // Following are defines that adjust the execution environment // They can be changed to make it faster or more efficient // //////////////////////////////////////////////////////////// // number of state frames to allocate at a time on the stack #define ALLOCUNIT 10 // set the maximum depth of the stack. Actually it only checks when // new frames are allocated, so the maximum possible stack size is // MAXSTACKDEPTH + ALLOCUNIT - 1 #define MAXSTACKDEPTH 200 // should be bigger? Should be runtime? ////////////////////////////////////////////////////////////// // // VD STUFF // ////////////////////////////////////////////////////////////// #include "unicode/vd_unicode.h" // this seems not to be in the standalone VD files I have typedef signed short VD_INT2; // vd return values #define VD_ERROR_RESULT -1 #define VD_INTERRUPTED_CALL 0 #define VD_COMPLETED_CALL 1 #define VD_RE_BREAK_ITERATIONS 200 #define VD_RE_RANGE_CHAR '-' #define VD_RE_ESCAPE_CHAR '\\' #define VD_RE_LOOKINGAT_CHAR '~' #define VD_RE_OR_CHAR '|' #define VD_RE_OPEN_OR_CHAR '[' #define VD_RE_CLOSE_OR_CHAR ']' #define VD_RE_OPEN_AND_CHAR '(' #define VD_RE_CLOSE_AND_CHAR ')' #define VD_RE_NOT_OR_CHAR '~' #define VD_RE_QUOTE_CHAR '"' #define VD_RE_OPEN_REPEAT_CHAR '{' #define VD_RE_CLOSE_REPEAT_CHAR '}' #define VD_RE_STAR_REPEAT_CHAR '*' #define VD_RE_PLUS_REPEAT_CHAR '+' #define VD_RE_QMARK_REPEAT_CHAR '?' #define VD_RE_LAZY_MODIFIER '?' ///////////////////////////////////////////////////////////////// // // PARSE TREE NODE STRUCTURE // // Used to build the parse tree when the RE is read. TRAVERSE steps // along this , finishing either when the stack is empty or when it // reaches the strategically placed SUCCESS node at the final leaf of // the parse tree // ////////////////////////////////////////////////////////////////// // Node types (unsigned short node.type) #define N_SUCCESS 0x0 #define N_AND 0x1 #define N_OR 0x2 #define N_LOGIC (N_OR | N_AND) // For convenience #define N_NOTOR 0x4 #define N_RANGE 0x8 #define N_STRING 0x10 #define N_MEMBER 0x20 #define TYPEMASK 0x3f // used to assign numbers to the nodes for debugging // node type modifiers #define N_LAZY 0x40 // use lazy evaluation (default is greedy) #define N_QUOTE 0x80 // Mark a node for inclusion in the result #define N_ORCHILD 0x100 // parent is an OR #define N_NOT 0x200 // parent is an OR NOT // mark a node as not to be returned in the result // This is used for various system nodes like EMPTY or ROOT #define N_STATIC 0x400 struct node { VD_UINT4 type; // node type: N_* VD_UINT4 min, max; // number of times node can be walked struct node *link; // link to trailing unit union { struct string { // string to be matched VD_UTF8 *start; VD_INT4 len; } string; struct logic { // AND or OR nodes struct node *root; } logic; struct range { // string to be a member of VD_UCS4 low; VD_UCS4 high; } range; } d; #ifdef DEBUG VD_INT4 id; // used to identify the node in debugging output #endif }; ///////////////////////////////////////////////////////// // // STATE STRUCTURE // // The STATE structure is kept on a stack during traversal and // represents the current state as the program walks over the // parse tree. // ///////////////////////////////////////////////////////// // values for the state.nextAction field // This is neat - originally the code had to check in several places // for lazy or not. Now, when properly initalized, both can be checked by // the common bit mask, and advanced by shifting right 1 bit. #define A_REPEAT_LAZY 0x1 #define A_CHAIN_LAZY 0x2 #define A_CHAIN_GREEDY 0x10 #define A_REPEAT_GREEDY 0x20 // these definitions check for the desired state, for both LAZY and GREEDY #define A_REPEAT (A_REPEAT_LAZY | A_REPEAT_GREEDY) #define A_CHAIN (A_CHAIN_LAZY | A_CHAIN_GREEDY) #define A_NOTFAIL (A_CHAIN | A_REPEAT) struct state { struct node *node; // when the frame is created PTR holds the starting point of the // match. If the match succeeds it is replaced with the end point VD_UTF8 *ptr; VD_INT4 parent; // parent frame VD_UINT4 count; // number of SUCCESSFUL reps (so it starts with 0) VD_UINT4 nextAction; // see comment for A_* }; ///////////////////////////////////////////////////////////// // // STATE STACK STRUCTURE // // The stack that holds the states representing the current RE // match position. It is used to backtrack in case of failure, // and to reconstruct the solution after successful completion /////////////////////////////////////////////////////////////// // Environment codes, for debugging and state info, used in stack->env #define E_STACK 0x1 // print out the stack on each rep (debug) #define E_FINALSTACK 0x2 // print out the stack on finish (debug) #define E_TREE 0x4 // print out the parse tree (debug) #define E_SHOWDEBUG 0x8 // print out various debugging info (debug) #define E_RESULTS 0x10 // print out the RESULTS structure (debug) #define E_RUNTESTS 0x20 // run the tests #define E_LOOKINGAT 0x40 // only match string starting in first position #define E_INTERACTIVE 0x80 // accept interactive RE and string #define E_STATS 0x100 // print out stats after completion (debug) #define E_NOMATCHCASE 0x200 // do not require case match struct stateStack { struct state *states; // states stack VD_UTF8 *strEnd; // pointer to the end of the string #ifdef DEBUG // debugging information VD_INT4 maxSize; // maximum size of stack VD_INT4 pushed; // total number of frames pushed on the stack VD_INT4 failed; // number of frames that failed compare #endif VD_INT4 iterations; // number of iterations (use to break for VD) VD_INT4 ptr; // pointer to next location (1 over current top) VD_INT4 allocated; // number of stack frames allocated VD_UINT4 env; // bits for debugging }; ///////////////////////////////////////////////////////////////// // // RESULT STRUCTURE // // Used to represent the output of a successful search. This // forms a nested list where the top level is the entire matched // search string, and lower layers represent quoted expressions // ///////////////////////////////////////////////////////////////// struct VD_RE_Result { struct VD_RE_Result *chain; // nested results list struct VD_RE_Result *contains; // next member of this result list VD_UTF8 *start; // start of this matched string VD_INT4 len; // length of this matched string }; // predeclarations see documentation in re1.c // (probably there should be 2 include files, a public one and // a private one) // internal use //gets the requested state from off the stack. If WHICH < 0 counts //down from the top (-1 is the top frame) struct state *VDI_getState(struct stateStack *stack, VD_INT4 which); // gets the number of the top frame on the stack VD_INT4 VDI_topFrame(struct stateStack *stack); // Gets the next character. It uses VD_UNICODE_UTF8GETINCR, but also // handles literal numbers (that is, \0xabc) VD_UCS4 VDI_nextChar(const VD_UTF8 *, VD_INT4 *, VD_INT4); // see documentation in re-parse.c // builds the parse tree for the regexp struct node *VDI_parseRE(VD_UTF8 *, VD_INT4, VD_UINT4); // returns resources used by the tree void VDI_cleanupTree(struct node *node); // called to check for character matches - since this can be // syntax dependent it needs to be in the parse file VD_INT4 VDI_matchChar(VD_UTF8 **, VD_UTF8 *, VD_INT4, VD_UINT4 *); //int strlen(char *); // in re-debug.c // If debugging is enabled predeclare the functions, otherwise // macro them out #ifdef DEBUG #define DODEBUG(xx, yy) {if (xx) {yy;}} void dbSummary(struct stateStack *stack); void dbStack(struct stateStack *stack); void dbTree(struct node *root); void dbState(struct stateStack *stack, VD_INT4 depth); void dbNode(struct node *node, VD_INT4 recurse); void nextID(VD_INT4 *, VD_INT4); void dbResults(struct VD_RE_Result *, VD_INT4); #else #define DODEBUG(xx, yy) #define dbStack(xxx) #define dbSummary(xxx) #define dbTree(xxx) #define dbState(xxx, yyy) #define dbNode(xxx, yyy) #define nextID(xxx, yyy) #define dbResults(xxx, yyy) #endif