#include <stdlib.h>
#include <stdio.h>
#include "re.h"

/******************************************************************
 * OUTLINE
 *
 * This design first parses the search specification to come up with a
 * parse tree, and then steps along that until it either finishes or
 * fails. This has a few good results: first, changes in the RE syntax
 * are isolated in the parser, and so are relatively easy to make
 * without touching the trickier parts of the program.
 *
 * Second, stepping along the tree is a natural way of doing this. For
 * example, in an earlier program I had a lot of trouble recognizing
 * when the search was complete. In the new way, by attaching a
 * SUCCESS node to the end of the tree, when the SUCCESS node is
 * reached the search is finished.
 *
 * So, the following program is in 3 parts. These are the RE parser,
 * the traverser, and the reporter. The parser parses the RE to get a
 * tree, the traverser walks the tree, leaving a stack trail
 * describing where it goes, and the reporter then deciphers that
 * trail to provide the RE match. More detail is provided in the
 * headers for each section.
 *
 *
 ******************************************************************/

/******************************************************************
 *              TODO 9/24
 * - better error handling
 * - program failure, ie memory allocation
 *
 *****************************************************************/

/******************************************************************
 * Traversal
 *
 * This section takes a parsed RE tree and tries to match it to the
 * string. It works by simultaneously keeping track of 2 structures:
 * its location in the RE parse tree, and the frame stack that keeps
 * track of the state. (at first I thought to do this with recursive
 * function calls, but there were 2 problems: first, I worried about
 * the overhead and size in evaluating complex regexps, and second at
 * times it is necessary to access data from lower down in the call
 * stack, so even if it had been implemented that way I would have
 * needed some sort of parallel stack)
 *
 * When a node is evaluated it is pushed onto the stack. If it
 * succeeds another repetition is attempted if it is not already at
 * the maximum. If that fails the next chained node is attempted, and
 * if that fails the node is backed off.
 *
 * In the failure mode processing the topmost node is examined to see
 * if it can be altered - for instance, if it is an OR it can try a
 * later alternative, or a repeated node can be backed off. Once a new
 * possibility is found forward processing resumes. 
 * 
 * Eventually (unless of a programming error) processing in the RE
 * tree will either return to the initial node, in which case the
 * search failed, or reach the end of the tree (the SUCCESS node) in
 * which case it succeeded.
 *
 ******************************************************************/

/**
   Build command:

   To build with the debugging aids, make sure DEBUG is defined in re.h and type

   > gcc -o re1 re1.c re-debug.c re-parse.c unicode/*.c

   With no debugging needed re-debug.c is not needed:

   > gcc -o re1 re1.c re-parse.c unicode/*.c

**/ 

////////////////////////////////////////////////////
//
// State frame handling functions
//
////////////////////////////////////////////////////

// stack->ptr points to the NEXT frame, the current one is at ptr - 1
VD_INT4 VDI_topFrame(struct stateStack *stack) {return stack->ptr - 1;}

static VD_INT4 popFrame(struct stateStack *stack) {
    stack->ptr--;
    return VDI_topFrame(stack);
}

struct state *VDI_getState(struct stateStack *stack, VD_INT4 which) {
    if (which < 0) {
        which = stack->ptr + which;
    }
    return stack->states + which;
}

// Initializes a new frame and pushes it on the stack, growing the
// stack size if necessary
// Return stack memory 
static void cleanupStack(struct stateStack *stack) {
    free(stack->states);
    free(stack);
}

// Gets the next char, but also interprets \xxx escape sequences
VD_UCS4 VDI_nextChar(const VD_UTF8 *str, VD_INT4 *poffset, VD_INT4 length) {
    VD_UCS4 ch, ch1;
    VD_UTF8 *p;
    ch = VD_Unicode_UTF8GetIncr(str, poffset, length);
    if (ch == VD_RE_ESCAPE_CHAR) {
        ch1 = strtol(str + *poffset, (char **) &p, 0);
        if (str + *poffset < p) {
            *poffset += p - str - *poffset;
            ch = ch1;
        }
    }
    return ch;
}

// This does a little more than just pushing a state, it also
// cherry-picks the easy pushes, to save time and trouble later on
static VD_INT4 pushNewStates(struct stateStack *stack, struct node *node, VD_UINT4 count, VD_UTF8 *ptr, VD_INT4 parent) {
    struct state *state = NULL;
    VD_INT4 ret = 0;
    while (node) {
        DODEBUG(stack->env & E_STATS, stack->pushed++);
        if (stack->ptr >= stack->allocated - 1) {
            if (stack->allocated >= MAXSTACKDEPTH) {
                printf("Exceeded maximum stack depth %d\n", stack->allocated);
                exit(1);
            }
            stack->allocated += ALLOCUNIT;
            stack->states = realloc(stack->states, stack->allocated*sizeof(struct state));
        }
        state = stack->states + stack->ptr;
        state->node = node;
        state->ptr = ptr;
        state->parent = parent;
        if (!(node->type & N_LOGIC) && !count && node->min) { 
            count = 1;       // leave off 0 reps for these
        }
        state->count = count;
		if (node->type & N_LAZY) {
			state->nextAction = (count < node->min) ? A_REPEAT_LAZY : A_CHAIN_LAZY;
		}
		else {
			state->nextAction = (count < node->max) ? A_REPEAT_GREEDY : A_CHAIN_GREEDY;
		}
#ifdef DEBUG
        if (stack->ptr >= stack->maxSize) {
            stack->maxSize = stack->ptr + 1;
        }
#endif
        ret = stack->ptr++;

        // cherry pick the "no brainer" repeats
        count = 0;
        if (node->type & N_LOGIC) {
            if (state->nextAction & A_REPEAT) {    // if it is a repeat then push its first child
                node = node->d.logic.root;
                parent = ret;
            }
            else if (node->link && !(node->type & N_ORCHILD)) { // else if it is a link and no other processing is needed, push the link
				state->nextAction >>= 1;
                node = node->link;
            }
            else break;
        }               
        else break;
    }
    return ret;
}

/////////////////////////////////////////////////////
//
// checking functions: 
// check if a node matches the string or not
//
/////////////////////////////////////////////////////

// Checks if a STRING matches the current pointer location.
// It takes into account special characters 
static VD_UTF8 *matchString(VD_UTF8 *str, VD_INT4 strlen, VD_UTF8 *re, VD_INT4 relen, VD_UINT4 *penv) {
    VD_UTF8 *end = re + relen;
    VD_INT4 result;
    while (re < end) {
        result = VDI_matchChar(&re, str, strlen, penv);
        if (result < 0) {
            return NULL;
        }
        str += result;
		strlen -= result;
    }
    return str;
}

// checks if a character is a member of a string (for OR)
static VD_UTF8 *matchOrString(VD_UTF8 *str, VD_INT4 strlen, VD_UTF8 *re, VD_INT4 relen, VD_UINT4 *penv) {
    VD_UTF8 *end = re + relen;
    VD_INT4 result;
    VD_INT4 i = 0;
    while (re < end) {
        result = VDI_matchChar(&re, str, strlen, penv);
        if (result > 0) {
            VDI_nextChar(str, &i, strlen);
            return str + i;
        }
    }
    return NULL;
}

static VD_UTF8 *matchNotOrNode(struct node *node, VD_UTF8 *str, VD_INT4 strlen, VD_UINT4 *penv) {
	VD_INT4 p = 0;
    VD_UINT4 ch = VDI_nextChar(str, &p, strlen);
	if (!p) {
		return NULL;
	}
	while (node) {
		if (((node->type & N_MEMBER) 
				&& matchOrString(str, strlen, node->d.string.start, node->d.string.len, penv))
			|| ((node->type & N_RANGE)
				&& ((node->d.range.low <= ch) && (ch <= node->d.range.high)))) {
			return NULL;
		}
		node = node->link;
	}
	return str + p;
}

// This does the special processing for each node type. For logic types it
// does nothing, for string types it does the needed compare
static VD_UTF8 *doChecks(struct stateStack *stack, struct state *state, VD_UTF8 *ptr) {
    struct node *node = NULL;
    VD_INT4 p = 0;
    VD_UINT4 ch;
	VD_INT4 strlen = stack->strEnd - ptr;
    if (state->count) {     // do not test the 0th entry
        node = state->node;
        switch (node->type & (N_STRING | N_MEMBER | N_RANGE | N_NOTOR)) {
        case N_STRING: ptr = matchString(ptr, strlen, node->d.string.start, node->d.string.len, &stack->env);
            break;
        case N_MEMBER: ptr = matchOrString(ptr, strlen, node->d.string.start, node->d.string.len, &stack->env);
            break;
        case N_RANGE:
            ch = VDI_nextChar(ptr, &p, strlen);
            ptr = ((node->d.range.low <= ch) && (ch <= node->d.range.high)) ? ptr + p : NULL;
            break;
		case N_NOTOR:
			ptr = matchNotOrNode(node->d.logic.root, ptr, strlen, &stack->env);
			break;
        case 0: break;
        default: printf("Very unexpected node value of 0X%x\n", node->type);
        }
    }
//	else if (!strlen) {
//		ptr = NULL;
//	}
    state->ptr = ptr;
    return ptr;
}

////////////////////////////////////////////////////
//
// Main control functions:
//
// Manipulate the stack and walk along the parse tree
// until the string is matched or the stack is empty
//
////////////////////////////////////////////////////

// When the search process has succeeded to a point, this routine 
// finds the next node to push on the stack.
//
// When the search is finished it will navigate its way up to the SUCCESS node
static VD_INT4 getNext(struct stateStack *stack, VD_UTF8 *ptr, VD_INT4 current) {
    struct state *state = NULL;
    while (1) {
        state = VDI_getState(stack, current);
        if (state->nextAction & A_REPEAT) {
            return pushNewStates(stack, state->node, state->count + 1, ptr, state->parent);
        }
        if (state->node->type & N_ORCHILD) {
// this gets done below; the conditions could be combined, but
// this way the code is clearer
//	        current = state->parent;
		}
		else if (state->node->link) {
            return pushNewStates(stack, state->node->link, 0, ptr, state->parent);
        }
        current = state->parent;
    }
}

// When a tested node has failed frames must be popped off until 
// one is reached that has an option: an OR with more possibilities, 
// or a repeat count that can be removed.
// Logic:
//
//  1) Topmost node has failed, pop it off and look at next node
//  2) IF current node is an OR && the failed node is one of its alternatives
//      && there are more alternatives THEN push the next alternative and return
//  3) IF the node is not in FAILED state AND its count is >= the minimum THEN shift to the
//		next state and get the next node
static VD_INT4 backoff(struct stateStack *stack) {
    VD_INT4 current = VDI_topFrame(stack);
    struct state *followingState = NULL, *state = VDI_getState(stack, current);
    DODEBUG(stack->env & E_SHOWDEBUG, printf("FAIL\n"));
    DODEBUG(stack->env & E_STATS, stack->failed++);

    while (stack->ptr > 1) {
        followingState = state;
        current = popFrame(stack);
        state = VDI_getState(stack, current);
        if ((state->node->type & N_OR)              // this is an OR...
            && (followingState->parent == current)  // AND the following node is its child...
            && (followingState->node->link)) {      // AND there are more possibilities
            return pushNewStates(stack, followingState->node->link, 0, state->ptr, current);
        }
        if (state->nextAction & A_NOTFAIL) {		// hasn't failed
            if (state->count >= state->node->min) {	// achieved minimum count
				state->nextAction >>= 1;			// fallback to next action
                return getNext(stack, state->ptr, current);	// get next 
            }
        }
    }
    return -1;
}


// Does the actual work: walks along the tree and moves the frame stack
// until the RE is either matched or fails
// return 0 if the call is interrupted, 1 if it has completed
static VD_INT4 traverse(struct stateStack *stack) {
    struct state *state = VDI_getState(stack, -1);
    VD_UTF8 *ptr = state->ptr;
    do {
        ptr = doChecks(stack, state, state->ptr);
        (ptr) ? getNext(stack, ptr, VDI_topFrame(stack)) : backoff(stack);
        DODEBUG(stack->env & E_STACK, dbStack(stack));
        state = VDI_getState(stack, -1);
        if (stack->iterations++ >= VD_RE_BREAK_ITERATIONS) {
            return 0;
        }
    } 
    while ((stack->ptr > 1) && (state->node->type != N_SUCCESS));
    return 1;
}


/*****************************************************************
*
* RUNNING : Following are the control functions for running the
* searcher, including both entry points for calling it and, if
* compiled in, the MAIN entry point and test functions for
* debugging and testing
*
******************************************************************/
// Some special instructions can be given at the beginning of the RE.
// For now these include anchoring the beginning of the match and
// turning on or off greedy matching. 
// Does this belong in PARSE?
static VD_INT4 checkInitialChar(VD_UTF8 *re, VD_INT4 *penv) {
    VD_INT4 len = 0, chlen = 0;
    while (VDI_nextChar(re, &chlen, -1) == VD_RE_ESCAPE_CHAR) {
        switch (VDI_nextChar(re, &chlen, -1)) {
        case '~':
            *penv |= E_LOOKINGAT;
            break;
        default:
            return len;
        }
		len += chlen;
        re += chlen;
        chlen = 0;
    }
    return len;
}

// initiates the search after the stack is set up elsewhere, and also
// resumes it after interrupting for the kernel
VD_INT4 VD_RESearchContinue(struct stateStack *stack) {
    VD_INT4 offset;
    stack->iterations = 0;
    do {
        if (!traverse(stack)) {
            DODEBUG(stack->env & E_SHOWDEBUG, printf("Interrupting search\n"));
            return VD_INTERRUPTED_CALL;
        }
        if ((stack->ptr > 1) || (stack->env & E_LOOKINGAT)) {
            break;
        }
        // match failed, reset the stack for the next try
        stack->ptr = 1;
        stack->states[0].nextAction = A_REPEAT_GREEDY;
        offset = 0;
        VDI_nextChar(stack->states[0].ptr, &offset, -1);
        stack->states[0].ptr += offset;     // try next start char
    } while (stack->states[0].ptr < stack->strEnd);
    DODEBUG(stack->env & E_STATS, dbSummary(stack));
    return VD_COMPLETED_CALL;
}

// Initial entry point for the RE function, it sets up the frame stack to the 
// initial state and then calls VD_RESearchContinue to do the work
VD_INT4 VD_RESearch(struct stateStack *stack, VD_UTF8 *re, VD_INT4 relen, VD_UTF8 *str, VD_INT4 strlen, VD_UINT4 env) {
    struct node *root = NULL;
	VD_INT4 len;
    DODEBUG(env & E_SHOWDEBUG, printf("RE is '%s', length is %d\nstring is '%s', length is %d\n",
        re, relen, str, strlen));
    len = checkInitialChar(re, &env);
	re += len;
	relen -= len;
    stack->states = NULL;
    stack->ptr = stack->allocated = 0;
    stack->env = env;
    stack->strEnd = str + strlen;
    root = VDI_parseRE(re, relen, env & E_TREE);
    if (!root) {
        printf("Failed to parse RE '%s'\n", re);
        return VD_ERROR_RESULT;
    }
    // initialize stack for call
    DODEBUG(env & E_STATS, stack->failed = stack->pushed = stack->maxSize = 0);
    pushNewStates(stack, root, 0, str, -1);
    return VD_RESearchContinue(stack);
}

/******************************************************************
 *
 * Reporting: these functions build the structure that reports the
 * results. It is a nested list of all quoted subREs.
 *
 ******************************************************************/
// gets a node for the report structure
// breaks silently if memory allocation fails
static struct VD_RE_Result *getResult(VD_UTF8 *start, VD_INT4 len, struct VD_RE_Result *contains) {
    struct VD_RE_Result *r = NULL;
    if (r = malloc(sizeof(struct VD_RE_Result))) {
        r->start = start;
        r->len = len;
        r->contains = contains;
        r->chain = NULL;
    }
    return r;
}

// returns memory from a RESULT structure
void VD_FreeREResults(struct VD_RE_Result *results) {
    if (results) {
        VD_FreeREResults(results->contains);
        VD_FreeREResults(results->chain);
        free(results);
    }
}

// Builds the RESULT structure after a successful search. This is
// a nested list with the outer list being the entire search, and 
// sublists corresponding to nodes marked with \"

static struct VD_RE_Result *results(struct stateStack *stack, struct node *terminate) {
    struct state *state = NULL, *end = NULL;
    struct VD_RE_Result hold, *r0 = &hold, *r = NULL;
    VD_UTF8 *p = NULL;
    state = VDI_getState(stack, -1);
    hold.chain = NULL;
    while ((state->node != terminate) && ((state + 1)->node->type != N_SUCCESS)) {
        stack->ptr++;
        r = NULL;
        if (state->node->type & N_QUOTE) {
            if (state->node->type & N_LOGIC) {
                if ((state + 1)->parent == stack->ptr - 2) {
                    p = state->ptr;
                    r0->chain = getResult(state->ptr, 0, results(stack, state->node));
                    r0->chain->len = VDI_getState(stack, -1)->ptr - p;
                    r0 = r0->chain;
                }
            }
            else if (state->count) {
                p = (state - 1)->ptr;
                r0->chain = getResult(p, state->ptr - p, NULL);
                r0 = r0->chain;
            }
        }
        state = VDI_getState(stack, -1);
    }
    return hold.chain;
}

static void freeREStack(struct stateStack *stack) {
	if (stack->states) {
	    VDI_cleanupTree(stack->states[0].node);
		free(stack->states);
	}
}

struct VD_RE_Result *VD_RESearchResults(struct stateStack *stack) {
    struct VD_RE_Result *result = NULL;
    if (stack->ptr > 1) {
        DODEBUG(stack->env & (E_FINALSTACK | E_STACK), dbStack(stack));
        stack->ptr = 1;
        result = results(stack, NULL);
        DODEBUG(stack->env & E_RESULTS, dbResults(result, 2));
    }
    freeREStack(stack);
    return result;
}

#ifdef STANDALONE
// tests have been moved to re-parse.c because they are associated
// with a particular syntax
extern char *tests[][3];

// Compares the string from a single RESULT node with the expected result
VD_UTF8 *compareOneResult(VD_UTF8 *p, VD_INT4 len, VD_UTF8 *expected) {
    VD_INT4 i;
    for (i = 0; i < len; i++) {
        if (*(p + i) != *expected++) {
            return NULL;
        }
    }
    return expected;
}

// Compares an entire RESULT tree from a successful search with the
// expected result
VD_UTF8 *compareResults(struct VD_RE_Result *results, VD_UTF8 *expected) {
    if (!results || !expected) {
        return NULL;
    }
    while (results) {
        if (!(expected = compareOneResult(results->start, results->len, expected))) {
            return NULL;
        }
        if (results->contains
            && ((*expected++ != '<') 
                || !(expected = compareResults(results->contains, expected))
                || (*expected++ != '>'))) {
            return NULL;
        }

        results = results->chain;
        if (results && (*expected++ != ',')) {
            return NULL;
        }
    }
    return expected;
}

// Runs all the self-tests
VD_INT4 runTests(VD_UINT4 env) {
    struct VD_RE_Result *results = NULL;
    struct stateStack stack;
    VD_INT4 i, failed = 0;
    VD_UTF8 *r = NULL;
    VD_UTF8 *x;
    env |= E_LOOKINGAT;
    for (i = 0; tests[i][0]; i++) {
        printf("%4d %20s %20s %30s...", i, tests[i][0], tests[i][1], (tests[i][2]) ? (char *) tests[i][2]: "NO MATCH");
        fflush(stdout);
        if (VD_RESearch(&stack, tests[i][0], strlen(tests[i][0]), tests[i][1], strlen(tests[i][1]), env) == VD_INTERRUPTED_CALL) {
            while (VD_RESearchContinue(&stack) == VD_INTERRUPTED_CALL);
        }
        results = VD_RESearchResults(&stack);
        r = "passed";
        if (results) {
            if (!(x = compareResults(results, tests[i][2])) || (*x)) {
                r = "failed";
//                dbResults(results, 5);
                failed++;
            }
            VD_FreeREResults(results);
        }
        else if (tests[i][2]) {
            r = "failed";
            failed++;
        }
        printf(" %s\n", r);
    }
    printf("Ran %d tests, %d succeeded, %d failed\n", i, i - failed, failed);
    return (failed != 0);
}

// Prints out usage message and exits
void help(VD_UTF8 *name) {
    printf("%s [-SWITCHES...] REGEXP STRING\n", name);
    printf("\t-x: run tests (add -l to run lazy tests)\n");
    printf("\t-a: match must start at the first character ('looking at')\n");
    printf("\t-A: search entire string for match (default)\n");
    printf("\t-c: ignore case in checking for match\n");
    printf("\t-C: compare case in checking for match (default)\n");
#ifdef DEBUG
    printf("\t-f: print out frames after successful search\n");
    printf("\t-F: print out frames after each iteration\n");
    printf("\t-t: print out the parse tree\n");
    printf("\t-r: print out results structure\n");
    printf("\t-s: print out statistics on finish\n");
    printf("\t-d: print out various debugging information\n");
    printf("\t-z: equivalent to -atrsdf\n");
#endif
    exit(1);
}

VD_INT4 getArgs(VD_INT4 argc, char **argv, VD_INT4 *penv) {
    VD_INT4 i, j, ret = 0;
    *penv = 0;
    for (i = 1; (i < argc) && (*argv[i] == '-'); i++) {
        for (j = 1; argv[i][j]; j++) {
            switch (argv[i][j]) {
            case 'x': *penv |= E_RUNTESTS; ret = 1; break;
            case 'a': *penv |= E_LOOKINGAT; break;
            case 'A': *penv &= ~E_LOOKINGAT; break;
            case 'C': *penv &= ~E_NOMATCHCASE; break;
            case 'c': *penv |= E_NOMATCHCASE; break;
#ifdef DEBUG
            case 'f': *penv |= E_FINALSTACK; break;
            case 'F': *penv |= E_STACK; break;
            case 't': *penv |= E_TREE; break;
            case 'r': *penv |= E_RESULTS; break;
            case 's': *penv |= E_STATS; break;
            case 'd': *penv |= E_SHOWDEBUG; break;
            case 'z': *penv |= (E_TREE | E_RESULTS | E_SHOWDEBUG | E_STATS | E_LOOKINGAT | E_FINALSTACK); break;
#endif
            default: return -1;
            }
        }
    }
    if (ret) {
        return ret;
    }
    if (argc == i) {
        *penv |= E_INTERACTIVE;
        return 0;
    }
    return (argc - i == 2) ? i : -1;
}

char *stripEOL(char *p) {
    char *p1 = p;
    if (p) {
        while (*p++);
        *(p - 2) = 0;
    }
    return  p1;
}

VD_INT4 interactive(VD_UINT4 env) {
    char regexp[RE_SIZE + 1];
    char string[STRING_SIZE + 1];
    struct VD_RE_Result *results = NULL;
    struct stateStack stack;
    do {
        printf("Enter regexp: ");
        fflush(stdout);
        stripEOL(fgets(regexp, RE_SIZE, stdin));
        if (regexp[0]) {
            printf("Enter string: ");
            fflush(stdout);
            stripEOL(fgets(string, STRING_SIZE, stdin));
            if (VD_RESearch(&stack, regexp, strlen(regexp), string, strlen(string), env) == VD_INTERRUPTED_CALL) {
                while (VD_RESearchContinue(&stack) == VD_INTERRUPTED_CALL);
            }
            results = VD_RESearchResults(&stack);
            if (!results) {
                printf("Search failed\n");
            }
            else {
                printf("SEARCH SUCCEEDED, %.*s\n", results->len, results->start);
                VD_FreeREResults(results);
            }
        }
    }
    while (regexp[0]);
    return 0;
}

int main(VD_INT4 argc, char **argv) {
    struct stateStack stack;
    struct VD_RE_Result *results = NULL;
    VD_INT4 i = 0;
    VD_UINT4 env = 0;
    VD_UTF8 *re = NULL, *str = NULL;

    if ((i = getArgs(argc, argv, &env)) < 0) {
        help(argv[0]);
    }
    if (env & E_RUNTESTS) {
        i = runTests(env);
    }
    else if (env & E_INTERACTIVE) {
        i = interactive(env);
    }
    else {
        re = argv[i++];
        str = argv[i];
// Below here is the template code for vd to use 
        if ((i = VD_RESearch(&stack, re, strlen(re), str, strlen(str), env)) == VD_INTERRUPTED_CALL) {
            while ((i = VD_RESearchContinue(&stack)) == VD_INTERRUPTED_CALL);
        }
        results = VD_RESearchResults(&stack);
// above here is the template code for vd to use 
        if (!results) {
            printf("Search failed\n");
            i = 1;
        }
        else {
            printf("SEARCH SUCCEEDED, %.*s\n", results->len, results->start);
            VD_FreeREResults(results);
            i = 0;
        }
    }
    return i;
} 

#endif

