filter provides a finite state machine filter stateFilter for processing lines during input and output by file.

filter.h

Copyright © 2003, 2004 Dave Bayer. Subject to the terms and conditions of the MIT License.

NCHARS

stateFilter looks up text by state and initial character; NCHARS gives the number of characters in the character set, for use as an array bound.

#define NCHARS 256

filterRules

Filter rules are defined as null-terminated filterRules arrays. If the current state is fromState and the input text at the current input index begins with the prefix fromStr, then this prefix is replaced by toStr in the output text, and the current state becomes toState. If mark is nonzero, then the output index of the start of toStr is saved in a filterMark struct, tagged with mark. The input and output indices are advanced to the first character after the corresponding prefixes.

The empty string is allowed as either a fromState or a toState prefix. The longest string that matches takes priority, in the case of multiple matches. In particular, the empty string matches with lowest priority.

If there is no match, then the filt routine specified in filterState is called. If filt is null, then the current character is copied from the input text to the output text, the indices are incremented, and the current state stays the same.

The lengths of fromStr and toStr are cached in fromLen and toLen, and next is used to reorganize filter rules as an array of linked lists.

typedef struct filterRules
{
    char *fromStr, *toStr;
    unsigned fromState, toState, mark;
    unsigned fromLen, toLen;
    struct filterRules *next;
} filterRules;

filterMark

stateFilter will optionally mark locations in a line, by attaching a linked filterMark list to the data field of the line. mark identifies the type of mark, and index gives its location in the line. next implements the linked list.

typedef struct filterMark
{
    unsigned mark, index;
    struct filterMark *next;
} filterMark;

filterTable

Filter rules are organized as linked lists in a filterTable array, of length equal to the number of states.

typedef filterRules *filterTable[NCHARS];

filterState

filterState is passed via the data field of a lineFilter to stateFilter, to determine the current filterTable array and the current state.

filt, if not null, is called by stateFilter when no rule applies. filt is passed handles to the input and output strings, and the current state. A filt routine returns the new state and advances the string pointers. It must make progress, to avoid putting stateFilter in an infinite loop.

typedef struct filterState
{
    filterTable *table;
    unsigned state;
    unsigned  (*filt)( char **ps, char **pt, unsigned state );
} filterState;

Function prototypes:

void markFree( void *data );
void initFilterTable( filterTable *table, filterRules *rules, unsigned nstates, BOOL clear );
void stateFilter( line *buf1, line *buf2, void *data );

#ifdef TESTCODE
void filterTest( void );
#endif

filter.c

Copyright © 2003, 2004 Dave Bayer. Subject to the terms and conditions of the MIT License.

#include "root.h"
#include "file.h"
#include "filter.h"

markAlloc

markAlloc allocates initialized filterMark structs. It is static because only stateFilter creates marks.

static filterMark *markAlloc( void )
{
    filterMark *p;

    p = malloc( sizeof(*p));
    p->mark = 0;
    p->index = 0;
    p->next = 0;
    return p;
}

markFree

markFree frees linked filterMark lists. It is public, of type agreeing with lineFreeFn, so that it can be passed to lineFree, lineListFree, lineReset, and lineDelete.

void markFree( void *data )
{
    filterMark *p, *q;

    p = data;
    while ( p != 0 )
    {
        q = p->next;
        free( p );
        p = q;
    }
}

initFilterTable

initFilterTable processes a null-terminated filterRules array rules, sorting it into linked lists in the provided filterTable array table. nstates is the number of states in rules. States are assumed to be numbered 0 to nstates - 1; this is automatically the case if they are generated by an enum.

Because C strings are null terminated, this code automatically sorts empty fromStr prefixes into character index zero, where stateFilter will later look for them.

If clear is YES, then table is initialized with null pointers. Otherwise, multiple calls can add several sets of rules to the same table, allowing reuse of rule subsets.

void initFilterTable( filterTable *table, filterRules *rules, unsigned nstates, BOOL clear )
{
    int i, j;
    unsigned len;
    filterRules *p, **pp;

    if ( clear )
        for ( i=0; i<nstates; ++i )
            for ( j=0; j<NCHARS; ++j )
                table[i][j] = 0;
    for ( p=rules; p->fromStr!=0; ++p )
    {
        if ( p->toStr == 0 ) p->toStr = p->fromStr;
        p->fromLen = len = strlen( p->fromStr );
        p->toLen = strlen( p->toStr );
        p->next = 0;

Sort linked lists by longest fromStr first, to implement longest match. The code for stateFilter will apply the first rule it finds that matches.

        pp = &table[p->fromState][(int) *p->fromStr];
        while ( *pp != 0 && (*pp)->fromLen > len ) pp = &(*pp)->next;
        p->next = *pp;
        *pp = p;
    }
}

stateFilter

stateFilter copies the line buf1 to buf2, applying the rules initialized by initFilterTable.

void stateFilter( line *buf1, line *buf2, void *data )
{
    char *s, *t;
    filterRules *p;
    filterMark *q, **qq;
    filterState *state;

    state = data;
    qq = (filterMark **) &buf2->data;
    for ( s=buf1->s, t=buf2->s; *s!='\0'; )
    {

First, try to match a nonempty prefix against the rest of the line.

        for ( p=state->table[state->state][(int) *s]; p!=0; p=p->next )
            if ( strncmp( s, p->fromStr, p->fromLen ) == 0 )
                break;

If there was no match, try to match an empty prefix. If this ruleTable entry is nonempty, it points to a matching rule.

        if ( p == 0 ) p = state->table[state->state][0];

If a matching rule has been found, apply it.

        if ( p != 0 )
        {
            strncpy( t, p->toStr, p->toLen );
            state->state = p->toState;
            if ( p->mark != 0 )
            {
                q = markAlloc();
                q->mark = p->mark;
                q->index = t - buf2->s;
                *qq = q;
                qq = &q->next;
            }
            s += p->fromLen;
            t += p->toLen;
        }

Otherwise, call filt if not null to update the string pointers and the current state. If filt is null, copy one character without changing the state.

        else if ( state->filt != 0 )
            state->state = state->filt( &s, &t, state->state );
        else
            *t++ = *s++;
    }
    *t = '\0';
    buf2->len = t - buf2->s;
}

Test code:

#ifdef TESTCODE

Define test rules for use with filterTest. These rules find quoted phrases, remove the quotes, and enclose the phrases in the HTML <samp> element. Escaped quotes \" are output as quotes. The first nonblank character of each quoted phrase is marked with testPhrase, and the corresponding closing quote position is marked testQuote.

Note the use of an empty fromStr prefix to match by default the first nonblank character within a quote.

typedef enum { testStart, testQuote, testPhrase, testNStates } testState;

static filterRules *testTable[testNStates][NCHARS];

static filterRules testRules[] =
{
    { "\"", "<samp>", testStart, testQuote, 0 },
    { "\"", "</samp>", testQuote, testStart, 0 },
    { "\"", "</samp>", testPhrase, testStart, testQuote },

    { " ", 0, testQuote, testQuote, 0 },
    { "\t", 0, testQuote, testQuote, 0 },
    { "", 0, testQuote, testPhrase, testPhrase },

    { "\\\"", "\"", testStart, testStart, 0 },
    { "\\\"", "\"", testPhrase, testPhrase, 0 },

    { 0 }
};

testFilt

testFilt replaces 'a' by 'A', in any state.

static BOOL testFilt( char **ps, char **pt, unsigned state )
{
    if ( **ps == 'a' )
    {
        **pt = 'A';
        (*ps)++;
        (*pt)++;
    }
    else
        *(*pt)++ = *(*ps)++;
    return state;
}

filterTest

filterTest writes the text file TempFilterTest, and reads it using fileRead. The text is filtered using the above test rules. The file is written to stdout, and each marked quoted phrase is written to stdout.

void filterTest( void )
{
    line *l, *m;
    filterMark *p;
    FILE *fin, *fout;
    lineFilter filter;
    filterState state;
    const char name[] = "TempFilterTest";
    const char text[] =
        "here is an escaped \\\" quote\n"
        "testing \" filter\" and the \" mark\" mechanism\n"
        "does an embedded \" \\\" escaped quote\" work?\n"
        ;

    fout = fileOpen( name, "w", YES );
    fputs( text, fout );
    fileClose( fout, YES );

    initFilterTable( testTable, testRules, testNStates, YES );
    filter.filt = stateFilter;
    filter.next = 0;
    filter.data = &state;
    state.table = testTable;
    state.state = testStart;
    state.filt = testFilt;

    fin = fileOpen( name, "r", YES );
    l = fileRead( fin, &filter, 0 );
    fileWrite( stdout, l, 0 );

    putchar( '\n' );
    for( m=l; m!=0; m=m->next )
    {
        for ( p=m->data; p!=0; p=p->next )
            if ( p->mark == testQuote )
                m->s[p->index] = '\0';
        for ( p=m->data; p!=0; p=p->next )
            if ( p->mark == testPhrase )
                printf( "%s\n", m->s + p->index );
    }

    lineListFree( l, markFree );
    fileClose( fin, YES );
}

#endif