Category : Files from Magazines
Archive   : CGAZV5N1.ZIP
Filename : MATCH.C

 
Output of file : MATCH.C contained in archive : CGAZV5N1.ZIP

/*********** MATCH.C ************************ Listing 1 ***********
* Author: Allen I. Holub
* Function: A group of subroutines to find a substring represented
* by a grep-like regular expression in a second string.
* Compiler: Any ANSI Standard C compiler.
* Verified for: Turbo C++ 1.0, Microsoft C 6.0
* Memory Models: any
*
* Compile-time switches: -DMAIN to make a small grep-like main()
*
* (c) C Gazette. May be used freely as long as author and publication
* are acknowledged
*----------------------------------------------------------------------
*/
#include
#include
#include
#include


#ifdef DEBUG
#define D(x) x /* If DEBUG is defined, D(printf("hello");) expands */
#else /* to printf("hello"). If DEBUG isn't defined, D(...) */
#define D(x) /* is expanded to an empty string, effectively */
#endif /* removing the printf() statement from the input. */


/* Metacharacters in the input: */
#define BOL '^' /* start-of-line anchor */
#define EOL '$' /* end-of-line anchor */
#define ANY '.' /* matches any character */
#define CCL '[' /* start a character class */
#define CCLEND ']' /* end a character class */
#define NCCL '^' /* negates character class if 1st char. */
#define CLOSURE '*' /* Kleene closure (matches 0 or more) */
#define PCLOSE '+' /* Positive closure (1 or more) */
#define OPT '?' /* Optional closure (0 or 1) */

typedef enum action /* These are put in the pattern string */ {
/* to represent metacharacters. */
M_BOL = ( 0x80 | '^' ), M_EOL = ( 0x80 | '$' ),
M_ANY = ( 0x80 | '.' ), M_CCL = ( 0x80 | '[' ),
M_OPT = ( 0x80 | '?' ), M_CLOSE = ( 0x80 | '*' ),
M_PCLOSE = ( 0x80 | '+' )
}
action;

typedef unsigned char pattern; /* pattern strings are unsigned char */

#define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an */
/* action that represents a metacharacter */

#define MAXPAT 128 /* max num. of pattern elements. Remember that */
/* character classes require 17 pattern elements */

/*----------------------------------------------------------------------*/
#define MAPSIZE 16 /* need this many bytes for character class bit map */

/* Advance a pointer into the pattern template */
/* to the next pattern element, this is a +1 for */
/* all pattern elements but M_CCL, where you */
/* to skip past both the M_CCL character and the */
/* bitmap that follows that character */

#define ADVANCE(pat) (pat += (*pat == M_CCL) ? (MAPSIZE+1) : 1)

/* Bitmap functions. Set bit b in the map and */
/* test bit b to see if it was set previously */

#define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= (1<< ((b) & 0x07)) )
#define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) )
/*----------------------------------------------------------------------*/
#define E_NONE 0 /* Possible return values from pat_err.*/
#define E_ILLEGAL 1 /* Set in makepat() to indicate prob- */
#define E_NOMEM 2 /* lems that came up while making the */
#define E_PAT 3 /* pattern template. */

static int Error = E_NONE; /* error flag, like errno */
int pat_err(){ return Error; } /* returns current error status */

/*----------------------------------------------------------------------*/
static pattern * doccl (pattern *map, unsigned char *src);
extern pattern * makepat (char *exp);
extern char * match (char *look_for, char *in_this_string);
extern int pat_err (void );
extern char * matchs (char *str, pattern *pat, int ret_endp);
static int omatch (char **strp, pattern *pat, char *start);
extern char * patcmp (char *str, pattern *pat, char *start);
extern int esc (char **);

/*----------------------------------------------------------------------*/
pattern *makepat( exp )
char *exp; /* regular expression */
{
/* Make a pattern template from the string pointed to by exp.
* Stop when '\0' or '\n' is found in exp.
* Return: a pointer to the pattern template on success, NULL
* on failure (in which case the pat_err() subroutine
* will return one of the following values:
*
* E_ILLEGAL Illegal input pattern.
* E_NOMEM out of memory
* E_PAT pattern too long.
*/

pattern *pat; /* pattern template being assembled */
pattern *cur; /* pointer to current pattern element */
pattern *prev; /* pointer to previous pattern element */

pat = NULL;
Error = E_ILLEGAL;
if(!*exp || *exp=='\n' )
goto exit;

if( *exp==CLOSURE || *exp==PCLOSE || *exp==OPT )
goto exit;

Error = E_NOMEM;
if( ! (pat = malloc(MAXPAT)) ) /* get pattern buffer */
goto exit;
D( memset(pat, 0, MAXPAT); ) /* zero buffer if debugging */

prev = cur = pat;
Error = E_PAT;

while( *exp && *exp != '\n' ) {
if( cur >= &pat[MAXPAT-1] ) {
free( pat );
pat = NULL;
goto exit;
}

switch( *exp ) {
case ANY:
*cur = M_ANY;
prev = cur++;
++exp;
break;

case BOL:
*cur = (cur == pat) ? M_BOL : *exp;
prev = cur++;
++exp;
break;

case EOL:
*cur = (!exp[1] || exp[1]=='\n') ? M_EOL : *exp;
prev = cur++;
++exp;
break;

case CCL:
if( ((cur - pat) + MAPSIZE) >= MAXPAT ) {
free( pat );
pat = NULL;
goto exit; /* not enough room for bit map */
}

prev = cur;
*cur++ = M_CCL;
exp = doccl( cur, exp );
cur += MAPSIZE ;
break;

case OPT: case CLOSURE: case PCLOSE:
switch( *prev ) {
case M_BOL: case M_EOL:
case M_OPT:
case M_PCLOSE: case M_CLOSE:
free( pat );
pat = NULL;
goto exit;
}

memmove( prev+1, prev, cur-prev );
*prev = (*exp == OPT) ? M_OPT :
(*exp == PCLOSE) ? M_PCLOSE : M_CLOSE ;
++cur;
++exp;
break;

default:
prev = cur;
*cur++ = esc( &exp );
break;
}
}

*cur = '\0';
Error = E_NONE;

exit: return( pat );
}

/*----------------------------------------------------------------------*/
static pattern *doccl( map, src )
pattern *src, *map;
{
/* Set bits in the map corresponding to characters specified in
* the src character class. */

int first, last, negative;
pattern *start = src;

++src; /* skip past the [ */
if( negative = (*src == NCCL) ) /* check for negative ccl */
++src;
start = src; /* start of characters in class */
memset(map, 0, MAPSIZE); /* bitmap initially empty */

while( *src && *src != CCLEND ) {
if( *src != '-') {
first = esc(&src); /* Use temp. to avoid macro */
SETBIT( first, map ); /* side effects. */
}

else if( src == start ) {
SETBIT( '-', map ); /* literal dash at start or end */
++src;
}
else {
++src; /* skip to end-of-sequence char */
if( *src < src[-2] ) {
first = *src;
last = src[-2];
}
else {
first = src[-2];
last = *src;
}
while( ++first <= last )
SETBIT( first, map );
src++;
}
}

if( *src == CCLEND )
++src; /* Skip CCLEND */

if( negative )
for( first = MAPSIZE; --first >= 0 😉
*map++ ^= ~0; /* invert all bits */

return src;
}

/*---------------------------------------------------------------*/
char *match( look_for, in_this_string )
char *look_for;
char *in_this_string;
{
/* Return a pointer to the characters matching look_for
* if it's in the string, else return NULL. You can use
* the pat_err() subroutine to distinguish a bad regular
* expression from a string-not-found condition [NULL is
* returned from match() in both cases].
*/

pattern *pat;
char *rval = NULL;

if( pat = makepat(look_for) ) /* make the pattern */ {
rval = matchs( in_this_string, pat, 0 ); /* see if it's there */
free( pat );
}
return rval;
}

/*----------------------------------------------------------------------*/
char *matchs( str, pat, ret_endp )
char *str;
pattern *pat;
int ret_endp;
{
/* Uses patcmp() to look for a match of pat anywhere in str using
* a brute-force algorithm. str is a character string while pat is
* a pattern template made by makepat(). Returns:
* 1. NULL if no match was found.
* 2. A pointer the last character satisfying the match if
* ret_endp is true.
* 3. A pointer to the beginning of the matched string
* if ret_endp is false.
*/

char *start;
char *end = NULL;

if( !pat ) /* This lets you do: matchs(str,makepat(...),?);*/
return NULL; /* without horrible consequences if makepat() */
/* fails. */
if( !*str ) {
if((*pat == M_EOL) ||
(*pat == M_BOL && (!pat[1] || pat[1]==M_EOL)))
end = str;
}
else {
start = str; /* Do a brute-force substring search, com- */
while( *str ) /* paring a pattern against the input string */ {
if( !(end = patcmp(str, pat, start)) )
str++;
else /* successful match */ {
if( !ret_endp )
end = str ;
break;
}
}
}
return( end );
}

/*----------------------------------------------------------------------*/
char *patcmp( str, pat, start ) /* formerly amatch() */
char *str, *start;
pattern *pat;
{
/* Like strcmp, but compares str against pat. Each element of str
* is compared with the template until either a mis-match is
* found or the end of the template is reached. In the former
* case a 0 is returned; in the latter, a pointer into str
* (pointing to the last character in the matched pattern)
* is returned. Strstart points at the first character in the
* string, which might not be the same thing as line if the
* search started in the middle of the string.
*/

char *bocl, /* beginning of closure string. */
*end; /* return value: end-of-string pointer. */

if( !pat ) /* make sure pattern is valid */
return( NULL );

while( *pat ) {
if( *pat == M_OPT ) {
/* Zero or one matches. It doesn't matter if omatch
* fails---it will advance str past the character on
* success, though. Always advance the pattern past
* both the M_OPT and the operand.
*/

omatch( &str, ++pat, start );
ADVANCE(pat);
}
else if( !(*pat == M_CLOSE || *pat == M_PCLOSE) ) {
/* Do a simple match. Note that omatch() fails if there's
* still something in pat but we're at end of string.
*/

if( !omatch( &str, pat, start ) )
return NULL;

ADVANCE(pat);
}
else /* Process a Kleene or positive closure */ {
if( *pat++ == M_PCLOSE ) /* one match required */
if( !omatch(&str, pat, start ) )
return NULL;

/* Match as many as possible, zero is okay */

bocl = str;
while ( *str && omatch(&str, pat, start) )
;

/* 'str' now points to the character that made
* made us fail. Try to process the rest of the
* string. If the character following the closure
* could have been in the closure (as in the pattern
* "[a-z]*t") the final 't' will be sucked up in the
* while loop. So, if the match fails, back up a notch
* and try to match the rest of the string again,
* repeating this process recursively until we get back
* to the beginning of the closure. The recursion
* goes, at most, one levels deep.
*/

if( *ADVANCE(pat) ) {
for(; bocl <= str; --str )
if( end = patcmp(str, pat, start) )
break;
return end;
}
break;
}
}

/* omatch() advances str to point at the next
* character to be matched. So str points at the
* character following the last character matched
* when you reach the end of the template. The
* exceptions are templates containing only a
* BOLN or EOLN token. In these cases omatch doesn't
* advance. Since we must return a pointer to the last
* matched character, decrement str to make it point at
* the end of the matched string, making sure that the
* decrement hasn't gone past the beginning of the string.
*
* Note that $ is a position, not a character, but in the case
* of a pattern ^$, a pointer to the end of line character is
* returned. In ^xyz$, a pointer to the z is returned.
*
* The --str is done outside the return statement because
* max() is often a macro that has side-effects.
*/

--str;
return( max(start, str) );
}

/*----------------------------------------------------------------------*/
static int omatch( strp, pat, start )
char **strp;
pattern *pat;
char *start;
{
/* Match one pattern element, pointed at by pat, against the
* character at **strp. Return 0 on a failure, 1 on success.
* *strp is advanced to skip over the matched character on a
* successful match. Closure is handled one level up by patcmp().
*
* "start" points at the character at the left edge of the
* line. This might not be the same thing as *strp if the
* search is starting in the middle of the string. An end-of-
* line anchor matches '\n' or '\0'.
*/

int advance = -1; /* amount to advance *strp, -1 == error */

switch( *pat ) {
case M_BOL: /* First char in string? */
if( *strp == start ) /* Only one star here. */
advance = 0;
break;

case M_ANY: /* . = anything but newline */
if( **strp != '\n' )
advance = 1;
break;

case M_EOL:
if( **strp == '\n' || **strp == '\0' )
advance = 0;
break;

case M_CCL:
if( TSTBIT( **strp, pat + 1 ) )
advance = 1;
break;

default: /* literal match */
if( **strp == *pat )
advance = 1;
break;
}

if( advance > 0 )
*strp += advance;

return( advance + 1 );
}

/*----------------------------------------------------------------------*/
#ifdef MAIN
main( int argc, char **argv )
{
static pattern *pat;
static FILE *inp;
static char inp_buf[ 1024 ];

if( argc < 2 || argv[1][0]=='-' ) {
fprintf(stderr, "Usage: match reg_exp filename\n");
fprintf(stderr, "Usage: match reg_exp < filename\n");
exit( 1 );
}

if( !(pat = makepat( argv[1] )) ) {
fprintf(stderr, "Can't make expression template\n");
exit( 1 );
}

if( argc == 2 )
inp = stdin;
else if( !(inp = fopen(argv[2],"r")) ) {
perror( argv[2] );
exit( 1 );
}

while( fgets( inp_buf, sizeof(inp_buf), inp ) )
if( matchs(inp_buf, pat, 0) )
fputs( inp_buf, stdout );

exit( 0 );
}
#endif