C code by k4st - 829 lines

Link:
k4st - C, pasted on Jun 27:
/*
 * regexp-parser.c
 *
 *  Created on: Jun 22, 2009
 *      Author: petergoodman
 *     Version: $Id$
 */

#include <p-regexp.h>

static char buffer[3],
            curr_char;

static unsigned int line = 0,
                    column = 0,
                    in_char_class = 0,
                    first_char_in_class = 1,
                    got_char = 0;

static FILE *fp = NULL;

/* grammar terminals */
enum {
    L_START_GROUP,
    L_END_GROUP,
    L_START_CLASS,
    L_START_NEG_CLASS,
    L_END_CLASS,
    L_POSITIVE_CLOSURE,
    L_KLEENE_CLOSURE,
    L_OPTIONAL,
    L_CARROT,
    L_ANCHOR_START,
    L_ANCHOR_END,
    L_OR,
    L_CHARACTER_RANGE,
    L_CHARACTER,
    L_SPACE,
    L_NEW_LINE,
    L_CARRIAGE_RETURN,
    L_TAB,
    L_ANY_CHAR
};

/* grammar non-terminals */
enum {
    P_MACHINE,
    P_EXPR,
    P_CAT_EXPR,
    P_FACTOR,
    P_TERM,
    P_OR_EXPR,
    P_STRING,
    P_CHAR,
    P_CHAR_CLASS_STRING,
    P_OPT_CHAR_CLASS_STRING,
    P_CHAR_RANGE,
    P_CHARACTER_CLASS,
    P_NEGATED_CHARACTER_CLASS,
    P_KLEENE_CLOSURE,
    P_POSITIVE_CLOSURE,
    P_OPTIONAL_TERM
};

/* data structure holding information to perform Thompson's construction while
 * the parse tree is being traversed. */
typedef struct PThompsonsConstruction {
    PNFA *nfa;
    unsigned int state_stack[256];
    int top_state;
} PThompsonsConstruction;

/**
 * The top-level non-terminal action. This ties all of the information together
 * for the NFA.
 */
static void Machine(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {

    unsigned int start, end;
    PT_Terminal *term;
    PParseTree *tree;

    switch(num_branches) {
        /* ... */
        case 1:
            if(branches[0]->type != PT_NON_TERMINAL) {
                return;
            }
            start = thompson->state_stack[thompson->top_state--];
            end = thompson->state_stack[thompson->top_state--];
            break;

        case 2:
            /* ^ ... */
            if(branches[0]->type == PT_TERMINAL) {
                start = nfa_add_state(thompson->nfa);

                /* ^ ... */
                if(branches[1]->type != PT_TERMINAL) {
                    nfa_add_value_transition(
                         thompson->nfa,
                         start,
                         thompson->state_stack[thompson->top_state--],
                         '\n'
                    );
                    end = thompson->state_stack[thompson->top_state--];

                /* ^$ */
                } else {
                    end = nfa_add_state(thompson->nfa);
                    nfa_add_value_transition(thompson->nfa, start, end, '\n');
                }
            /* ... $ */
            } else {
                start = thompson->state_stack[thompson->top_state--];
                end = nfa_add_state(thompson->nfa);
                nfa_add_value_transition(
                     thompson->nfa,
                     thompson->state_stack[thompson->top_state--],
                     end,
                     '\n'
                );
            }
            break;

        /* ^ ... $ */
        case 3:
            start = nfa_add_state(thompson->nfa);
            end = nfa_add_state(thompson->nfa);
            nfa_add_value_transition(
                 thompson->nfa,
                 start,
                 thompson->state_stack[thompson->top_state--],
                 '\n'
            );
            nfa_add_value_transition(
                 thompson->nfa,
                 thompson->state_stack[thompson->top_state--],
                 end,
                 '\n'
            );
            break;
        default:
            start = nfa_add_state(thompson->nfa);
            end = start;
            break;
    }

    nfa_change_start_state(thompson->nfa, start);
    nfa_add_accepting_state(thompson->nfa, end);
}

/**
 * For two sub-expressions A and B, AB is turned into the following structure:
 *
 * >--(A)--()--(B)-->
 *
 * Unfortunately this adds in extra epsilon transitions to connect various
 * NFA sub-machines. The construction of NFAs is such that it would be
 * extremely annoying to attempt to merge two states.
 */
static void CatExpr(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {

    unsigned int start, prev, end, inter_start, inter_end;
    PT_Terminal *character;

    end = thompson->state_stack[thompson->top_state - 1];

    if(num_branches > 1) {
        for(; --num_branches; ) {
            inter_start = thompson->state_stack[thompson->top_state];
            inter_end = thompson->state_stack[thompson->top_state - 3];

            nfa_add_epsilon_transition(
                thompson->nfa,
                inter_end,
                inter_start
            );

            thompson->top_state -= 2;
        }
    }

    thompson->state_stack[thompson->top_state - 1] = end;
}

/**
 * For two sub-expression A and B, A|B is turned into the following structure:
 *
 *             .-->--(A)-->--.
 * >--(start)--|             |--(end)-->
 *             `-->--(B)-->--'
 *
 */
static void OrExpr(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {

    unsigned int a1_start, a1_end, a2_start, a2_end, start, end;

    a1_start = thompson->state_stack[thompson->top_state--];
    a1_end = thompson->state_stack[thompson->top_state--];
    a2_start = thompson->state_stack[thompson->top_state--];
    a2_end = thompson->state_stack[thompson->top_state--];

    start = nfa_add_state(thompson->nfa);
    end = nfa_add_state(thompson->nfa);

    nfa_add_epsilon_transition(thompson->nfa, start, a1_start);
    nfa_add_epsilon_transition(thompson->nfa, start, a2_start);

    nfa_add_epsilon_transition(thompson->nfa, a1_end, end);
    nfa_add_epsilon_transition(thompson->nfa, a2_end, end);

    thompson->state_stack[++thompson->top_state] = end;
    thompson->state_stack[++thompson->top_state] = start;
}

/**
 * For two characters A and B, AB is turned into the following structure:
 *
 * >--(A)-->--(B)-->
 *
 */
static void String(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {

    unsigned int start, prev, end, i;
    PT_Terminal *character;

    start = nfa_add_state(thompson->nfa);
    prev = start;

    for(i = 0; i < num_branches; ++i) {
        character = (PT_Terminal *) branches[i];
        end = nfa_add_state(thompson->nfa);
        nfa_add_value_transition(
             thompson->nfa,
             prev,
             end,
             character->lexeme->str[0]
        );
        prev = end;
    }

    thompson->state_stack[++thompson->top_state] = end;
    thompson->state_stack[++thompson->top_state] = start;
}

typedef void (R_set_elm_fnc_t)(PSet *, unsigned int);

static void R_char_class(PThompsonsConstruction *thompson,
                         unsigned char phrase,
                         unsigned int num_branches,
                         PParseTree *branches[],
                         PSet *set,
                         R_set_elm_fnc_t *fnc) {

    unsigned int start, end, i;
    unsigned char range_start, range_end;
    PT_NonTerminal *range;

    for(i = 0; i < num_branches; ++i) {
        if(branches[i]->type == PT_NON_TERMINAL) {
            range = (PT_NonTerminal *) branches[i];
            range_start = ((PT_Terminal *) tree_get_branch(range, 0))->lexeme->str[0];
            range_end = ((PT_Terminal *) tree_get_branch(range, 1))->lexeme->str[0];

            for(; range_start <= range_end; ++range_start) {
                fnc(set, range_start);
            }
        } else {
            fnc(set,((PT_Terminal *) branches[i])->lexeme->str[0]);
        }
    }

    start = nfa_add_state(thompson->nfa);
    end = nfa_add_state(thompson->nfa);
    nfa_add_set_transition(thompson->nfa, start, end, set);

    thompson->state_stack[++thompson->top_state] = end;
    thompson->state_stack[++thompson->top_state] = start;
}

/**
 * For all characters c in some character class C, C is turned into the following
 * structure:
 *
 * >--({c1, c2, ..., cN})-->
 *
 */
static void CharClass(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {
    R_char_class(
        thompson,
        phrase,
        num_branches,
        branches,
        set_alloc(),
        &set_add_elm
    );
}

/**
 * For all characters c in some negated character class C, C is turned into the
 * following structure:
 *
 * >--({c1, c2, ..., cN}^c)-->
 *
 */
static void NegatedCharClass(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {
    R_char_class(
        thompson,
        phrase,
        num_branches,
        branches,
        set_alloc_inverted(),
        &set_remove_elm
    );
}

/**
 * For some sub-expression A, A* is turned into the following structure:
 *
 *              .-->-->--.
 * >--(start)--|--<-.    |--(end)-->
 *              `->-(A)--'
 *
 */
static void KleeneClosure(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {
    unsigned int c_start, c_end, start, end, loop_start, loop_end;

    loop_start = thompson->state_stack[thompson->top_state--];
    loop_end = thompson->state_stack[thompson->top_state--];

    start = nfa_add_state(thompson->nfa);
    end = nfa_add_state(thompson->nfa);

    nfa_add_epsilon_transition(thompson->nfa, start, loop_start);
    nfa_add_epsilon_transition(thompson->nfa, start, end);
    nfa_add_epsilon_transition(thompson->nfa, loop_end, end);
    nfa_add_epsilon_transition(thompson->nfa, loop_end, loop_start);

    thompson->state_stack[++thompson->top_state] = end;
    thompson->state_stack[++thompson->top_state] = start;
}

/**
 * For some sub-expression A, A+ is turned into the following structure:
 *
 * >--(start)-->--<-.    .--(end)-->
 *              `->-(A)--'
 *
 */
static void PositiveClosure(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {
    unsigned int c_start, c_end, start, end, loop_start, loop_end;

    loop_start = thompson->state_stack[thompson->top_state--];
    loop_end = thompson->state_stack[thompson->top_state--];

    start = nfa_add_state(thompson->nfa);
    end = nfa_add_state(thompson->nfa);

    nfa_add_epsilon_transition(thompson->nfa, start, loop_start);
    nfa_add_epsilon_transition(thompson->nfa, loop_end, end);
    nfa_add_epsilon_transition(thompson->nfa, loop_end, loop_start);

    thompson->state_stack[++thompson->top_state] = end;
    thompson->state_stack[++thompson->top_state] = start;
}

/**
 * For some sub-expression A, A? is turned into the following structure:
 *
 * >--(start)-->-->--->--.--(end)-->
 *              `->-(A)--'
 *
 */
static void OptionalTerm(PThompsonsConstruction *thompson,
                 unsigned char phrase,
                 unsigned int num_branches,
                 PParseTree *branches[]) {

    unsigned int c_start, c_end, start, end, loop_start, loop_end;

    loop_start = thompson->state_stack[thompson->top_state--];
    loop_end = thompson->state_stack[thompson->top_state--];

    start = nfa_add_state(thompson->nfa);
    end = nfa_add_state(thompson->nfa);

    nfa_add_epsilon_transition(thompson->nfa, start, loop_start);
    nfa_add_epsilon_transition(thompson->nfa, start, end);
    nfa_add_epsilon_transition(thompson->nfa, loop_end, end);

    thompson->state_stack[++thompson->top_state] = end;
    thompson->state_stack[++thompson->top_state] = start;
}

/**
 * Scan the input letter-by-letter until a lexeme is matched. Stuff the lexeme
 * into the 'token' variable along with any other information such as
 * line/column number, etc.
 */
static int R_get_token(PScanner *scanner, PToken *token) {

    do {
        ++column;
        curr_char = fgetc(fp);

        if(curr_char == '\n') {
            ++line;
            column = 0;
        }
    } while(isspace(curr_char));

    if(feof(fp)) {
        return 0;
    }

    token->lexeme_length = 1;
    token->line = line;
    token->column = column;
    token->lexeme = buffer;

    buffer[1] = 0;
    buffer[0] = curr_char;

    if(in_char_class) {
        goto any_char;
    }

    switch(curr_char) {
        case '(': token->terminal = L_START_GROUP; break;
        case ')': token->terminal = L_END_GROUP; break;
        case '[':
            token->terminal = L_START_CLASS;
            in_char_class = 1;
            first_char_in_class = 1;

            curr_char = fgetc(fp);

            got_char = 1;

            ++column;
            if(feof(fp)) {
                return 0;
            }

            if(curr_char == '^') {
                buffer[0] = '[';
                buffer[1] = '^';
                buffer[2] = 0;
                token->terminal = L_START_NEG_CLASS;
            } else {
                ungetc(curr_char, fp);
            }

            break;
        case '+': token->terminal = L_POSITIVE_CLOSURE; break;
        case '*': token->terminal = L_KLEENE_CLOSURE; break;
        case '^': token->terminal = L_ANCHOR_START; break;
        case '$': token->terminal = L_ANCHOR_END; break;
        case '?': token->terminal = L_OPTIONAL; break;
        case '|': token->terminal = L_OR; break;
        case '.': token->terminal = L_ANY_CHAR; break;
        default:
any_char:
            switch(curr_char) {

                case ']':
                    token->terminal = L_END_CLASS; in_char_class = 0;
                    break;

                case '\\':
                    curr_char = fgetc(fp);
                    ++column;
                    if(feof(fp)) {
                        return 0;
                    }

                    buffer[0] = curr_char;

                    switch(curr_char) {
                        case '-': token->terminal = L_CHARACTER; break;
                        case 'n': token->terminal = L_NEW_LINE; break;
                        case 's': token->terminal = L_SPACE; break;
                        case 't': token->terminal = L_TAB; break;
                        case 'r': token->terminal = L_CARRIAGE_RETURN; break;
                        default:
                            if(isspace(curr_char)) {
                                printf(
                                    "Scanner Error: a space cannot follow a "
                                    "backslash.\n"
                                );
                                exit(1);
                            }

                            goto all_chars;
                    }
                    break;

                default:
                    if(in_char_class && curr_char == '-') {
                        token->terminal = L_CHARACTER_RANGE;
                    } else {
all_chars:
                        token->terminal = L_CHARACTER;
                    }
            }

            first_char_in_class = 0;
    }

    return 1;
}

/**
 * Construct the grammar for recognizing a regular expression.
 */
static PGrammar *regex_grammar(void) {

    PGrammar *G = grammar_alloc(
        P_MACHINE, /* production to start matching with */
        16, /* number of non-terminals */
        19, /* number of terminals */
        50, /* number of production phrases */
        80 /* number of phrase symbols */
    );

    /*
     * Machine
     *     : -<anchor_line_start> -<anchor_line_end>
     *     : -<anchor_line_start> ^Expr -<anchor_line_end>
     *     : -<anchor_line_start> ^Expr
     *     : ^Expr -<anchor_line_end>
     *     : ^Expr
     *     : <>
     *     ;
     */

    grammar_add_terminal_symbol(G, L_ANCHOR_START, G_NON_EXCLUDABLE);
    grammar_add_terminal_symbol(G, L_ANCHOR_END, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_ANCHOR_START, G_NON_EXCLUDABLE);
    grammar_add_non_terminal_symbol(G, P_EXPR, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_ANCHOR_END, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_ANCHOR_START, G_NON_EXCLUDABLE);
    grammar_add_non_terminal_symbol(G, P_EXPR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_EXPR, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_ANCHOR_END, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_EXPR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_epsilon_symbol(G, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_MACHINE, (G_ProductionRuleFunc *) &Machine);


    /*
     * OrExpr
     *     : ^Expr <or> CatExpr
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_EXPR, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_OR, G_AUTO);
    grammar_add_non_terminal_symbol(G, P_CAT_EXPR, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_OR_EXPR, (G_ProductionRuleFunc *) &OrExpr);

    /*
     * Expr
     *     : OrExpr
     *     : CatExpr
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_OR_EXPR, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_CAT_EXPR, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_EXPR, &grammar_null_action);

    /*
     * CatExpr
     *     : ^CatExpr ^Factor
     *     : ^Factor
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_CAT_EXPR, G_RAISE_CHILDREN);
    grammar_add_non_terminal_symbol(G, P_FACTOR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_FACTOR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_CAT_EXPR, (G_ProductionRuleFunc *) &CatExpr);

    /*
     * KleeneClosure : ^Term <kleene_closure> ;
     */

    grammar_add_non_terminal_symbol(G, P_TERM, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_KLEENE_CLOSURE, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_KLEENE_CLOSURE, (G_ProductionRuleFunc *) &KleeneClosure);

    /*
     * PositiveClosure : ^Term <positive_closure> ;
     */

    grammar_add_non_terminal_symbol(G, P_TERM, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_POSITIVE_CLOSURE, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_POSITIVE_CLOSURE, (G_ProductionRuleFunc *) &PositiveClosure);

    /*
     * OptionalTerm : ^Term <optional> ;
     */

    grammar_add_non_terminal_symbol(G, P_TERM, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_OPTIONAL, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_OPTIONAL_TERM, (G_ProductionRuleFunc *) &OptionalTerm);

    /*
     * Factor
     *     : -KleeneClosure
     *     : -PositiveClosure
     *     : -OptionalTerm
     *     : ^Term
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_KLEENE_CLOSURE, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_POSITIVE_CLOSURE, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_OPTIONAL_TERM, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_TERM, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_FACTOR, &grammar_null_action);

    /*
     * Term
     *     : -NegatedCharacterClass
     *     : -CharacterClass
     *     : -String
     *     : <start_group> ^Expr <end_group>
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_NEGATED_CHARACTER_CLASS, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_CHARACTER_CLASS, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_STRING, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_START_GROUP, G_AUTO);
    grammar_add_non_terminal_symbol(G, P_EXPR, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_END_GROUP, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_TERM, &grammar_null_action);

    /*
     * NegatedCharacterClass : <start_neg_class> ^CharClassString <end_class> ;
     */

    grammar_add_terminal_symbol(G, L_START_NEG_CLASS, G_AUTO);
    grammar_add_non_terminal_symbol(G, P_CHAR_CLASS_STRING, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_END_CLASS, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_NEGATED_CHARACTER_CLASS, (G_ProductionRuleFunc *) &NegatedCharClass);

    /*
     * CharRange
     *     : ^Char <character_range> ^Char
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_CHAR, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_CHARACTER_RANGE, G_AUTO);
    grammar_add_non_terminal_symbol(G, P_CHAR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_CHAR_RANGE, &grammar_null_action);

    /*
     * OptCharClassString
     *     : ^CharClassString
     *     : <>
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_CHAR_CLASS_STRING, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_epsilon_symbol(G, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_OPT_CHAR_CLASS_STRING, &grammar_null_action);

    /*
     * CharClassString
     *     : -CharRange ^OptCharClassString
     *     : ^Char ^CharClassString
     *     : ^Char
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_CHAR_RANGE, G_NON_EXCLUDABLE);
    grammar_add_non_terminal_symbol(G, P_OPT_CHAR_CLASS_STRING, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_CHAR, G_RAISE_CHILDREN);
    grammar_add_non_terminal_symbol(G, P_CHAR_CLASS_STRING, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_CHAR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_CHAR_CLASS_STRING, &grammar_null_action);

    /*
     * CharacterClass : <start_class> ^CharClassString <end_class> ;
     */

    grammar_add_terminal_symbol(G, L_START_CLASS, G_AUTO);
    grammar_add_non_terminal_symbol(G, P_CHAR_CLASS_STRING, G_RAISE_CHILDREN);
    grammar_add_terminal_symbol(G, L_END_CLASS, G_AUTO);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_CHARACTER_CLASS, (G_ProductionRuleFunc *) &CharClass);

    /*
     * Char
     *     : -<characher>
     *     : -<any_char>
     *     : -<space>
     *     : -<tab>
     *     : -<new_line>
     *     : -<carriage_return>
     *     ;
     */

    grammar_add_terminal_symbol(G, L_CHARACTER, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_ANY_CHAR, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_SPACE, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_TAB, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_NEW_LINE, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_terminal_symbol(G, L_CARRIAGE_RETURN, G_NON_EXCLUDABLE);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_CHAR, &grammar_null_action);

    /*
     * String
     *     : ^Char ^String
     *     : ^Char
     *     ;
     */

    grammar_add_non_terminal_symbol(G, P_CHAR, G_RAISE_CHILDREN);
    grammar_add_non_terminal_symbol(G, P_STRING, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_non_terminal_symbol(G, P_CHAR, G_RAISE_CHILDREN);
    grammar_add_phrase(G);
    grammar_add_production_rule(G, P_STRING, (G_ProductionRuleFunc *) &String);

    return G;
}

/**
 * Parse a regular expression, turn it into a NFA using Thompson's construction,
 * then turn that NFA into a DFA using subset construction, then minimize the
 * DFA.
 */
void parse_regexp(const char *file) {

    PScanner *scanner = (PScanner *) 0x1; /* fake scanner */
    PGrammar *grammar = regex_grammar();
    PThompsonsConstruction thom, *thompson = &thom;

    if(is_not_null(fp = fopen(file, "r"))) {

        thompson->nfa = nfa_alloc();
        thompson->top_state = -1;

        parse_tokens(
            grammar,
            scanner,
            (PScannerFunction *) &R_get_token,
            (void *) thompson,
            TREE_TRAVERSE_POSTORDER
        );

        printf("\nFreeing memory.. \n");

        grammar_free(grammar);

        nfa_print_dot(thompson->nfa);

        nfa_free(thompson->nfa);

        fclose(fp);
    } else {
        printf("Error: Could not open file '%s'.", file);
    }
}
Create a new paste based on this one

Comments: