Category : Files from Magazines
Archive   : CGAZV3N4.ZIP
Filename : BM.C
*
* Boyer-Moore string search routine
*
* Author: John Rex
* References: (1) Boyer RS, Moore JS: "A fast string searching
* algorithm" CACM 20(10):762-777, 1977
* (2) plus others--see text of article
*
* Compilers: Microsoft C V5.1 - compile as is
* Turbo C V2.0 - compile as is
*
* Compile time preprocessor switches:
* DEBUG - if defined, include test driver
*
* Usage:
*
* char *pattern, *text; - search for pattern in text
* unsigned length; - length of text (the routine does
* NOT stop for '\0' bytes, thus
* allowing it to search strings
* stored sequentially in memory.
* char *start; - pointer to match
*
* char *Boyer_Moore(char *, char *, unsigned);
*
* start = Boyer_Moore(pattern, text, strlen(text);
*
* NULL is returned if the search fails.
*
* Switches: if defined:
*
* DEBUG will cause the search routine to dump its tables
* at various times--this is useful when trying to
* understand how upMatchJump is generated
*
* DRIVER will cause a test drive to be compiled
*
* Source code may be used freely if source is acknowledged.
* Object code may be used freely.
**************************************************************/
#define DRIVER
#if defined(DEBUG)
#define SHOWCHAR for (uT=1; uT<=uPatLen; uT++) \
printf(" %c ", pcPattern[uT-1])
#define SHOWJUMP for (uT=1;uT<=uPatLen;uT++) \
printf("%2d ", upMatchJump[uT])
#define SHOWA printf(" uA = %u ", uA)
#define SHOWB printf(" uB = %u", uB)
#define SHOWBACK for (uT=1;uT<=uPatLen;uT++) \
printf("%2d ", upBackUp[uT])
#define NL printf("\n")
unsigned uT;
#else
#define SHOWCHAR
#define SHOWJUMP
#define SHOWA
#define SHOWB
#define SHOWBACK
#define NL
#endif
#include
#include
#include
#define AlphabetSize 256
char *Boyer_Moore(pcPattern, pcText, uTextLen)
char *pcPattern; /* we search for this ... */
char *pcText; /* ... in this text ... */
unsigned uTextLen; /* ... up to this length */
{
/* array of character mis-match offsets */
unsigned uCharJump[AlphabetSize];
/* array of offsets for partial matches */
unsigned *upMatchJump;
/* temporary array for upMatchJump calc */
unsigned *upBackUp;
unsigned u, uPatLen;
unsigned uText, uPat, uA, uB;
/* Setup and initialize arrays */
uPatLen = strlen(pcPattern);
upMatchJump = (unsigned *)
malloc(2 * (sizeof(unsigned) * (uPatLen + 1)) );
upBackUp = upMatchJump + uPatLen + 1;
/* Heuristic #1 -- simple char mis-match jumps ... */
memset(uCharJump, 0, AlphabetSize*sizeof(unsigned));
for (u = 0 ; u < uPatLen; u++)
uCharJump[((unsigned char) pcPattern[u])]
= uPatLen - u - 1;
/* Heuristic #2 -- offsets from partial matches ... */
for (u = 1; u <= uPatLen; u++)
upMatchJump[u] = 2 * uPatLen - u;
/* largest possible jump */
SHOWCHAR; NL;
SHOWJUMP; NL;
u = uPatLen;
uA = uPatLen + 1;
while (u > 0) {
upBackUp[u] = uA;
while( uA <= uPatLen &&
pcPattern[u - 1] != pcPattern[uA - 1]) {
if (upMatchJump[uA] > uPatLen - u)
upMatchJump[uA] = uPatLen - u;
uA = upBackUp[uA];
}
u--;
uA--;
}
SHOWJUMP; SHOWA; SHOWBACK; NL;
for (u = 1; u <= uA; u++)
if (upMatchJump[u] > uPatLen + uA - u)
upMatchJump[u] = uPatLen + uA - u;
uB = upBackUp[uA];
SHOWJUMP; SHOWB; NL;
while (uA <= uPatLen) {
while (uA <= uB) {
if (upMatchJump[uA] > uB - uA + uPatLen)
upMatchJump[uA] = uB - uA + uPatLen;
uA++;
}
uB = upBackUp[uB];
}
SHOWJUMP; NL;
/* now search */
uPat = uPatLen; /* tracks position in Pattern */
uText = uPatLen - 1; /* tracks position in Text */
while (uText < uTextLen && uPat != 0) {
if (pcText[uText] == pcPattern[uPat - 1]) { /* match? */
uText--; /* back up to next */
uPat--;
}
else { /* a mismatch - slide pattern forward */
uA = uCharJump[((unsigned char) pcText[uText])];
uB = upMatchJump[uPat];
uText += max(uA, uB); /* select larger jump */
uPat = uPatLen;
}
}
/* return our findings */
if (uPat == 0)
return(pcText + (uText + 1)); /* have a match */
else
return (NULL); /* no match */
}
#pragma subtitle("Test Driver")
#ifdef DRIVER
#define PATTERN argv[1]
#define TEXT_SIZE 32000u
void main(argc, argv)
int argc;
char **argv;
{
char *start, *p;
char text[TEXT_SIZE];
int i;
unsigned length, count;
FILE *infile;
if (argc != 3) {
puts("Usage is: bm pattern file\n");
exit(0);
}
if ((infile = fopen(argv[2], "r")) == NULL) {
puts("Can't open input file\n");
exit(0);
}
/* read in whole file */
length = fread(text, 1, TEXT_SIZE, infile);
fclose(infile);
p=text;
count=0;
while (count < length) {
if (*p == '\n')
*p = '\0';
p++;
count++;
}
/* now search repeatedly */
start = Boyer_Moore(PATTERN, text, length);
while (start != NULL) {
for (p = start; ; p--) { /* find start of line */
if (*p == '\0') {
p++;
break;
}
else if (p == text)
break;
}
printf("Match:\n%s\n", p);
for (i=start-p; i>0; i--)
fputc(' ', stdout);
printf("%s\n\n", PATTERN);
start = Boyer_Moore(PATTERN, start+1,
length - (start-text) - 1);
}
}
#endif
Very nice! Thank you for this wonderful archive. I wonder why I found it only now. Long live the BBS file archives!
This is so awesome! 😀 I’d be cool if you could download an entire archive of this at once, though.
But one thing that puzzles me is the “mtswslnkmcjklsdlsbdmMICROSOFT” string. There is an article about it here. It is definitely worth a read: http://www.os2museum.com/wp/mtswslnk/