Logo Search packages:      
Sourcecode: pccts version File versions  Download package

rexpr.c

/*
 * This file contains code for
 *
 *      int rexpr(char *expr, char *s);
 *
 * which answers
 *
 *      1 if 's' is in the language described by the regular expression 'expr'
 *      0 if it is not
 *     -1 if the regular expression is invalid
 *
 * Language membership is determined by constructing a non-deterministic
 * finite automata (NFA) from the regular expression.  A depth-
 * first-search is performed on the NFA (graph) to check for a match of 's'.
 * Each non-epsilon arc consumes one character from 's'.  Backtracking is
 * performed to check all possible paths through the NFA.
 *
 * Regular expressions follow the meta-language:
 *
 * <regExpr>        ::= <andExpr> ( '|' <andExpr> )*
 *
 * <andExpr>        ::= <expr> ( <expr> )*
 *
 * <expr>           ::= {'~'} '[' <atomList> ']' <repeatSymbol>
 *                      | '(' <regExpr> ')' <repeatSymbol>
 *                      | '{' <regExpr> '}' <repeatSymbol>
 *                      | <atom> <repeatSymbol>
 *
 * <repeatSymbol>   ::= { '*' | '+' }
 *
 * <atomList>       ::= <atom> ( <atom> )*
 *                      | { <atomList> } <atom> '-' <atom> { <atomList> }
 *
 * <atom>           ::= Token[Atom]
 *
 * Notes:
 *          ~     means complement the set in [..]. i.e. all characters not listed
 *          *     means match 0 or more times (can be on expression or atom)
 *          +     means match 1 or more times (can be on expression or atom)
 *          {}    optional
 *          ()    grouping
 *          []    set of atoms
 *          x-y   all characters from x to y (found only in [..])
 *          \xx the character with value xx
 *
 * Examples:
 *          [a-z]+
 *                match 1 or more lower-case letters (e.g. variable)
 *
 *          0x[0-9A-Fa-f]+
 *                match a hex number with 0x on front (e.g. 0xA1FF)
 *
 *          [0-9]+.[0-9]+{e[0-9]+}
 *                match a floating point number (e.g. 3.14e21)
 *
 * Code example:
 *          if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
 *
 * Terence Parr
 * Purdue University
 * April 1991
 */

#include <stdio.h>
#include <ctype.h>
#ifdef __STDC__
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include "rexpr.h"

#ifdef __USE_PROTOS
static int regExpr( GraphPtr g );
static int andExpr( GraphPtr g );
static int expr( GraphPtr g );
static int repeatSymbol( GraphPtr g );
static int atomList( char *p, int complement );
static void next( void );
static ArcPtr newGraphArc( void );
static NodePtr newNode( void );
static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label );
static Graph BuildNFA_atom( int label );
static Graph BuildNFA_AB( Graph A, Graph B );
static Graph BuildNFA_AorB( Graph A, Graph B );
static Graph BuildNFA_set( char *s );
static Graph BuildNFA_Astar( Graph A );
static Graph BuildNFA_Aplus( Graph A );
static Graph BuildNFA_Aoptional( Graph A );
#else
static int regExpr();
static int andExpr();
static int expr();
static int repeatSymbol();
static int atomList();
static void next();
static ArcPtr newGraphArc();
static NodePtr newNode();
static int ArcBetweenGraphNode();
static Graph BuildNFA_atom();
static Graph BuildNFA_AB();
static Graph BuildNFA_AorB();
static Graph BuildNFA_set();
static Graph BuildNFA_Astar();
static Graph BuildNFA_Aplus();
static Graph BuildNFA_Aoptional();
#endif

static char *_c;
static int token, tokchar;
static NodePtr accept;
static NodePtr freelist = NULL;

/*
 * return 1 if s in language described by expr
 *        0 if s is not
 *       -1 if expr is an invalid regular expression
 */
#ifdef __USE_PROTOS
static int rexpr(char *expr,char *s)
#else
static int rexpr(expr, s)
char *expr, *s;
#endif
{
      NodePtr p,q;
      Graph nfa;
      int result;

      fprintf(stderr, "rexpr(%s,%s);\n", expr,s);
      freelist = NULL;
      _c = expr;
      next();
      if ( regExpr(&nfa) == -1 ) return -1;
      accept = nfa.right;
      result = match(nfa.left, s);
      /* free all your memory */
      p = q = freelist;
      while ( p!=NULL ) { q = p->track; free(p); p = q; }
      return result;
}

/*
 * do a depth-first-search on the NFA looking for a path from start to
 * accept state labelled with the characters of 's'.
 */

#ifdef __USE_PROTOS
static int match(NodePtr automaton,char *s)
#else
static int match(automaton, s)
NodePtr automaton;
char *s;
#endif
{
      ArcPtr p;
      
      if ( automaton == accept && *s == '\0' ) return 1;    /* match */

      for (p=automaton->arcs; p!=NULL; p=p->next)                 /* try all arcs */
      {
            if ( p->label == Epsilon )
            {
                  if ( match(p->target, s) ) return 1;
            }
            else if ( p->label == *s )
                        if ( match(p->target, s+1) ) return 1;
      }
      return 0;
}

/*
 * <regExpr>        ::= <andExpr> ( '|' {<andExpr>} )*
 *
 * Return -1 if syntax error
 * Return  0 if none found
 * Return  1 if a regExrp was found
 */

#ifdef __USE_PROTOS
static int regExpr(GraphPtr g)
#else
static int regExpr(g)
GraphPtr g;
#endif
{
      Graph g1, g2;
      
      if ( andExpr(&g1) == -1 )
      {
            return -1;
      }
      
      while ( token == '|' )
      {
            int a;
            next();
            a = andExpr(&g2);
            if ( a == -1 ) return -1;     /* syntax error below */
            else if ( !a ) return 1;      /* empty alternative */
            g1 = BuildNFA_AorB(g1, g2);
      }
      
      if ( token!='\0' ) return -1;

      *g = g1;
      return 1;
}

/*
 * <andExpr>        ::= <expr> ( <expr> )*
 */

#ifdef __USE_PROTOS
static int andExpr(GraphPtr g)
#else
static int andExpr(g)
GraphPtr g;
#endif
{
      Graph g1, g2;
      
      if ( expr(&g1) == -1 )
      {
            return -1;
      }
      
      while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' )
      {
            if (expr(&g2) == -1) return -1;
            g1 = BuildNFA_AB(g1, g2);
      }
      
      *g = g1;
      return 1;
}

/*
 * <expr>           ::=    {'~'} '[' <atomList> ']' <repeatSymbol>
 *                      | '(' <regExpr> ')' <repeatSymbol>
 *                      | '{' <regExpr> '}' <repeatSymbol>
 *                      | <atom> <repeatSymbol>
 */

#ifdef __USE_PROTOS
static int expr(GraphPtr g)
#else
static int expr(g)
GraphPtr g;
#endif
{
      int complement = 0;
      char s[257];    /* alloc space for string of char in [] */
      
      if ( token == '~' || token == '[' )
      {
            if ( token == '~' ) {complement = 1; next();}
            if ( token != '[' ) return -1;
            next();
            if ( atomList( s, complement ) == -1 ) return -1;
            *g = BuildNFA_set( s );
            if ( token != ']' ) return -1;
            next();
            repeatSymbol( g );
            return 1;
      }
      if ( token == '(' )
      {
            next();
            if ( regExpr( g ) == -1 ) return -1;
            if ( token != ')' ) return -1;
            next();
            repeatSymbol( g );
            return 1;
      }
      if ( token == '{' )
      {
            next();
            if ( regExpr( g ) == -1 ) return -1;
            if ( token != '}' ) return -1;
            next();
            /* S p e c i a l  C a s e   O p t i o n a l  {  } */
            if ( token != '*' && token != '+' )
            {
                  *g = BuildNFA_Aoptional( *g );
            }
            repeatSymbol( g );
            return 1;
      }
      if ( token == Atom )
      {
            *g = BuildNFA_atom( tokchar );
            next();
            repeatSymbol( g );
            return 1;
      }
      
      return -1;
}

/*
 * <repeatSymbol>   ::= { '*' | '+' }
 */
#ifdef __USE_PROTOS
static int repeatSymbol(GraphPtr g)
#else
static int repeatSymbol(g)
GraphPtr g;
#endif
{
      switch ( token )
      {
            case '*' : *g = BuildNFA_Astar( *g ); next(); break;
            case '+' : *g = BuildNFA_Aplus( *g ); next(); break;
      }
      return 1;
}

/*
 * <atomList>       ::= <atom> { <atom> }*
 *                      { <atomList> } <atom> '-' <atom> { <atomList> }
 *
 * a-b is same as ab
 * q-a is same as q
 */

#ifdef __USE_PROTOS
static int atomList(char *p, int complement)
#else
static int atomList(p, complement)
char *p;
int complement;
#endif
{
      static unsigned char set[256];            /* no duplicates */
      int first, last, i;
      char *s = p;
      
      if ( token != Atom ) return -1;
      
      for (i=0; i<256; i++) set[i] = 0;
      while ( token == Atom )
      {
            if ( !set[tokchar] ) *s++ = tokchar;
            set[tokchar] = 1;                   /* Add atom to set */
            next();
            if ( token == '-' )           /* have we found '-' */
            {
                  first = *(s-1);             /* Get last char */
                  next();
                  if ( token != Atom ) return -1;
                  else
                  {
                        last = tokchar;
                  }
                  for (i = first+1; i <= last; i++)
                  {
                        if ( !set[tokchar] ) *s++ = i;
                        set[i] = 1;                   /* Add atom to set */
                  }
                  next();
            }
      }
      *s = '\0';
      if ( complement )
      {
            for (i=0; i<256; i++) set[i] = !set[i];
            for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;
            *s = '\0';
      }
      return 1;
}

/* a somewhat stupid lexical analyzer */

#ifdef __USE_PROTOS
static void next(void)
#else
static void next()
#endif
{
      while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
      if ( *_c=='\\' )
      {
            _c++;
            if ( isdigit(*_c) )
            {
                  int n=0;
                  while ( isdigit(*_c) )
                  {
                        n = n*10 + (*_c++ - '0');
                  }
                  if ( n>255 ) n=255;
                  tokchar = n;
            }
            else
            {
                  switch (*_c)
                  {
                        case 'n' : tokchar = '\n'; break;
                        case 't' : tokchar = '\t'; break;
                        case 'r' : tokchar = '\r'; break;
                        default  : tokchar = *_c;
                  }
                  _c++;
            }
            token = Atom;
      }
      else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' &&
                    *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' &&
                    *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' )
      {
            token = Atom;
            tokchar = *_c++;
      }
      else
      {
            token = tokchar = *_c++;
      }
}

/* N F A  B u i l d i n g  R o u t i n e s */

#ifdef __USE_PROTOS
static ArcPtr newGraphArc(void)
#else
static ArcPtr newGraphArc()
#endif
{
      ArcPtr p;
      p = (ArcPtr) calloc(1, sizeof(Arc));
      if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
      if ( freelist != NULL ) p->track = (ArcPtr) freelist;
      freelist = (NodePtr) p;
      return p;
}

#ifdef __USE_PROTOS
static NodePtr newNode(void)
#else
static NodePtr newNode()
#endif
{
      NodePtr p;
      p = (NodePtr) calloc(1, sizeof(Node));
      if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
      if ( freelist != NULL ) p->track = freelist;
      freelist = p;
      return p;
}

#ifdef __USE_PROTOS
static void ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label)
#else
static void ArcBetweenGraphNodes(i, j, label)
NodePtr i, j;
int label;
#endif
{
      ArcPtr a;
      
      a = newGraphArc();
      if ( i->arcs == NULL ) i->arctail = i->arcs = a;
      else {(i->arctail)->next = a; i->arctail = a;}
      a->label = label;
      a->target = j;
}

#ifdef __USE_PROTOS
static Graph BuildNFA_atom(int label)
#else
static Graph BuildNFA_atom(label)
int label;
#endif
{
      Graph g;
      
      g.left = newNode();
      g.right = newNode();
      ArcBetweenGraphNodes(g.left, g.right, label);
      return( g );
}

#ifdef __USE_PROTOS
static Graph BuildNFA_AB(Graph A,Graph B)
#else
static Graph BuildNFA_AB(A, B)
Graph A, B;
#endif
{
      Graph g;
      
      ArcBetweenGraphNodes(A.right, B.left, Epsilon);
      g.left = A.left;
      g.right = B.right;
      return( g );
}

#ifdef __USE_PROTOS
static Graph BuildNFA_AorB(Graph A,Graph B)
#else
static Graph BuildNFA_AorB(A, B)
Graph A, B;
#endif
{
      Graph g;
      
      g.left = newNode();
      ArcBetweenGraphNodes(g.left, A.left, Epsilon);
      ArcBetweenGraphNodes(g.left, B.left, Epsilon);
      g.right = newNode();
      ArcBetweenGraphNodes(A.right, g.right, Epsilon);
      ArcBetweenGraphNodes(B.right, g.right, Epsilon);
      return( g );
}

#ifdef __USE_PROTOS
static Graph BuildNFA_set(char *s)
#else
static Graph BuildNFA_set( s )
char *s;
#endif
{
      Graph g;
      
      if ( s == NULL ) return g;
      
      g.left = newNode();
      g.right = newNode();
      while ( *s != '\0' )
      {
            ArcBetweenGraphNodes(g.left, g.right, *s++);
      }
      return g;
}

#ifdef __USE_PROTOS
static Graph BuildNFA_Astar(Graph A)
#else
static Graph BuildNFA_Astar( A )
Graph A;
#endif
{
      Graph g;

      g.left = newNode();
      g.right = newNode();
      
      ArcBetweenGraphNodes(g.left, A.left, Epsilon);
      ArcBetweenGraphNodes(g.left, g.right, Epsilon);
      ArcBetweenGraphNodes(A.right, g.right, Epsilon);
      ArcBetweenGraphNodes(A.right, A.left, Epsilon);
      
      return( g );
}

#ifdef __USE_PROTOS
static Graph BuildNFA_Aplus(Graph A)
#else
static Graph BuildNFA_Aplus( A )
Graph A;
#endif
{
      ArcBetweenGraphNodes(A.right, A.left, Epsilon);
      
      return( A );
}

#ifdef __USE_PROTOS
static Graph BuildNFA_Aoptional(Graph A)
#else
static Graph BuildNFA_Aoptional( A )
Graph A;
#endif
{
      Graph g;
      
      g.left = newNode();
      g.right = newNode();
      
      ArcBetweenGraphNodes(g.left, A.left, Epsilon);
      ArcBetweenGraphNodes(g.left, g.right, Epsilon);
      ArcBetweenGraphNodes(A.right, g.right, Epsilon);
      
      return( g );
}

Generated by  Doxygen 1.6.0   Back to index