Boyer-Moore based fgrep like program

sources-request at genrad.UUCP sources-request at genrad.UUCP
Thu Jul 4 00:24:55 AEST 1985


Mod.sources:  Volume 2, Issue 3
Submitted by: Arnold Robbins <linus!gatech!arnold>


The following program may be of some use to people out there.  In the usual
case of looking for a single string, it is 10% to 20% faster than fgrep.

There is no makefile, since everything is in a single source file.  Just
compile with "cc -O bgrep.c -o bgrep".

Enjoy,

Arnold Robbins
arnold at gatech.{UUCP, CSNET}
-------------- cut here ------------------------
#!/bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #!/bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
#	bgrep.c
#	bgrep.1
# This archive created: Mon Jul  1 10:52:37 1985
# By:	Arnold Robbins (Pr1mebusters!)
export PATH; PATH=/bin:$PATH
echo shar: extracting "'bgrep.c'" '(8635 characters)'
if test -f 'bgrep.c'
then
	echo shar: over-writing existing file "'bgrep.c'"
fi
cat << \SHAR_EOF > 'bgrep.c'
/* bgrep --- grep using Boyer-Moore pattern matcher */

/*
 * All the wrapper code (argument and file handling, printing, etc.) by
 * Arnold Robbins, gatech!arnold
 *
 * Boyer-Moore pattern matching, coded by Roy Mongiovi, gatech!gitpyr!roy
 * and edited some by Arnold Robbins.
 *
 * No warranty of suitability for any purpose, either expressed
 * or implied, is made.
 */

#include <stdio.h>
#include <ctype.h>

#define TRUE	1
#define FALSE	0

#define MAXPATS	120		/* (arbitrary maximum number of patterns */
#define MAXLINE	(256 + 1)

/*
 * command line options
 */

int Allbut = FALSE;		/* print lines that don't match pattern */
int Exact = FALSE;		/* only print lines that match exactly */
int Countlines = FALSE;		/* only print a count of matching lines */
int Listnames = FALSE;		/* only list file names that match */
int Numberlines = FALSE;	/* print relative line number */
int Silent = FALSE;		/* don't print anything, just exit */
int Monocase = FALSE;		/* ignore case distinctions */

/*
 * variables
 */

long Curline = 0;		/* current file input line */
long Lines_matched = 0;		/* how many lines matched the pattern */

int Lotsafiles = FALSE;		/* are there more than one file? */
int Pat_length[MAXPATS];	/* length of pattern */
int Line_length = 0;		/* length of line */
int Couldnt_open_files = FALSE;	/* one or more files could not be opened */
int Exit_val = 0;		/* return code status */
int Curpat = 0;			/* current pattern comparing against */
int Numpats = 0;		/* total number of patterns */

char Inbuf[MAXLINE];		/* input buffer */
char Pattern[MAXPATS][MAXLINE];	/* pattern to be matched; init = NULs */
char *Program = NULL;		/* program name */

int Argc;			/* make argc and argv global */
char **Argv;

extern char *basename();	/* leaf file of a pathname */

main (argc, argv, envp)
int argc;
char **argv, **envp;
{
	register int i, j;
	char *index();

	Program = basename (argv[0]);
	Argc = argc;
	Argv = argv;

	parse_args ();		/* deal with command line */

	if (Pattern[0][0] == '\0')	/* not from -f or -e */
	{
		if (Argv[0] == NULL)	/* no string given */
			usage ();
		setpats (Argv[0]);
		Argc--;
		Argv++;
	}

	for (Curpat = 0; Curpat <= Numpats; Curpat++)
	{
		if (Monocase)
			mapdown (Pattern[Curpat]);

		Pat_length[Curpat] = strlen (Pattern[Curpat]);
		initialize ();		/* set up necessary tables */
	}

	Lotsafiles = (Argc > 1);	/* more than one file left */

	if (Argc == 0)		/* search stdin */
		process ("-");
	else
		for (i = 0; Argv[i] != NULL; i++)
			process (Argv[i]);
	
	if (! Silent && Countlines)
		printf ("%ld\n", Lines_matched);

	if (Lines_matched == 0)
		exit (1);
	else if (Couldnt_open_files)
		exit (2);
	else
		exit (0);
}

/* process --- start doing the work on each file */

process (infile)
register char *infile;
{
	FILE *fp;
	int c;
	int success;
	long prev_lines_matched = Lines_matched;	/* save count */

	Curline = 0;	/* reset for each file */

	if (infile[0] == '-' && infile[1] == '\0')
		fp = stdin;
	else if ((fp = fopen (infile, "r")) == NULL)
	{
		Couldnt_open_files = TRUE;
		perror (infile);
		return;
	}

	while (fgets (Inbuf, sizeof Inbuf, fp) != NULL)
	{
		Curline++;
		if (Monocase)
			mapdown (Inbuf);
		Line_length = strlen (Inbuf);

		/* first, throw away rest of a truncated input line */
		if (Inbuf[Line_length - 1] != '\n')
			while ((c = getc (fp)) != '\n')
				;
		else
			Inbuf[--Line_length] = '\0';
			/* newline is there, nuke it */

		for (Curpat = 0; Curpat <= Numpats; Curpat++)
		{
			if (success = match ())
				Lines_matched++;
			/* do any necessary output */
			if (! Silent && ! Countlines && ! Listnames &&
				((success != FALSE) ^ (Allbut != FALSE)))
				/* either success, or Allbut, but not both,
				   and not neither */
			{
				if (Lotsafiles)
					printf ("%s: ", infile);
				if (Numberlines)
					printf ("%ld: ", Curline);
				printf ("%s\n", Inbuf);
			}
		}
	}

	fclose (fp);
	if (! Silent && Listnames && prev_lines_matched < Lines_matched)
		printf ("%s\n", infile);
}

/* parse_args --- check out command line arguments */

parse_args ()
{
	register int i,j;

	if (Argc == 1)
		usage ();

	for (Argc--, Argv++; Argv[0] != NULL && Argv[0][0] == '-'; Argc--, Argv++)
	{
		int cheat = FALSE;

		for (j = 1; Argv[0][j] != '\0'; j++)
		{
			switch (Argv[0][j]) {
			case 'c':
				Countlines = TRUE;
				break;

			case 'e':
				strcpy (Pattern[0], Argv[1]);
				Pattern[0][sizeof Pattern[0] - 1] = '\0';
				cheat = TRUE;
				continue;

			case 'f':
				patfromfile (Argv[1]);
				cheat = TRUE;
				continue;

			case 'i':
			case 'y':
				Monocase = TRUE;
				break;

			case 'l':
				Listnames = TRUE;
				break;

			case 'n':
				Numberlines = TRUE;
				break;

			case 's':
				Silent = TRUE;
				break;

			case 'v':
				Allbut = TRUE;
				break;

			case 'x':
				Exact = TRUE;
				break;

			default:
				usage ();
				break;
			}
		}
		if (cheat)
		{
			cheat = FALSE;
			Argc--;
			Argv++;
			/* boy is this stuff a kludge! */
		}
	}

	/* check for argument conflicts */
	if (
		(Silent &&
			(Allbut || Exact || Countlines || Listnames ||
				Numberlines))
		||
		(Allbut && Exact)
		||
		(Countlines && Listnames)
	)
	{
		fprintf (stderr, "%s: argument conflict -- see the man page\n",
			Program);
		usage ();	/* will exit */
	}
}

/* mapdown --- remove case distinctions in a string */

mapdown (str)
register char *str;
{
	register int i;

	for (i = 0; str[i] != '\0'; i++)
		if (isupper (str[i]))
			str[i] = tolower (str[i]);
}

/* return basename part of a pathname, if '/'s are present */

char *basename (str)
register char *str;
{
	register int i = 0;
	register int j = 0;

	for (; str[i] != '\0'; i++)
		if (str[i] == '/')
			j = i;
	
	if (j != 0)
		return (& str[++j]);
	else
		return (str);
}

/* usage --- print usage message and die */

usage ()
{
	fprintf (stderr, "usage: %s [-vxclnisef] [string] [files]\n",
			Program);
	exit (2);
}

/* index --- do index by hand so it'll work on any Unix */

char *index (str, c)
char *str, c;
{
	for (; *str; str++)
		if (*str == c)
			return (str);
	
	return (NULL);
}

/* patfromfile --- retrieve the pattern from the named file */

patfromfile (infile)
char *infile;
{
	register int i, j;
	register FILE *fp;
	register int c;

	if ((fp = fopen (infile, "r")) == NULL ||
			(c = getc (fp)) == EOF)
	{
		perror (infile);	/* be like standard grep */
		exit (2);
	}
	else
		ungetc (c, fp);

	for (i = 0; fgets (Pattern[i], MAXLINE, fp) != NULL; i++)
	{
		if (i >= 120)
		{
			fprintf (stderr, "%s: Only %d strings allowed\n",
				Program, MAXPATS);
			exit (2);
		}
		j = strlen (Pattern[i]);
		if (Pattern[i][j - 1] == '\n')
			Pattern[i][--j] = '\0';
	}
	Numpats = i - 1;

	fclose (fp);
}

/* setpats --- set up the patterns from a string */

setpats (str)
register char *str;
{
	register int i, j;

	while (*str == '\n' || *str == '\r')
		str++;

	for (i = j = 0; *str; str++)
	{
		if (*str == '\n')
		{
			Pattern[i][j] = '\0';
			j = 0;
			i++;
		}
		else
			Pattern[i][j++] = *str;
	}
	Numpats = i;
}

/* begin magic stuff for Boyer-Moore pattern matching */

int D1[MAXPATS][128];
int D2[MAXPATS][128];

int F[MAXPATS][128];

initialize ()
{
	register int i, t;

	for (i = 0; i < 128; i++)
		D1[Curpat][i] = Pat_length[Curpat];
	
	for (i = 0; i < Pat_length[Curpat]; i++)
		D1[Curpat][Pattern[Curpat][i]] = Pat_length[Curpat] - i - 1;
	
	for (i = 0; i < Pat_length[Curpat]; i++)
		D2[Curpat][i] = (Pat_length[Curpat] << 1) - i - 1;
	
	for (i = (t = Pat_length[Curpat]) - 1; i >= 0; i--, t--)
		for (F[Curpat][i] = t; t < Pat_length[Curpat]
			&& Pattern[Curpat][i] != Pattern[Curpat][t];
							t = F[Curpat][t])
			if (Pat_length[Curpat] - i - 1 < D2[Curpat][t])
				D2[Curpat][t] = Pat_length[Curpat] - i - 1;

	for (i = 0; i <= t; i++)
		if (Pat_length[Curpat] + t - i < D2[Curpat][i])
			D2[Curpat][i] = Pat_length[Curpat] + t - i;
}

/* match --- do Boyer-Moore pattern search on input buffer */

match ()
{
	register int i, j;

	if (Exact && Pat_length[Curpat] != Line_length)
		return FALSE;

	i = Pat_length[Curpat] - 1;

	while (i < Line_length)
	{
		j = Pat_length[Curpat] - 1;
		while (j >= 0)
		{
			if (Inbuf[i] == Pattern[Curpat][j])
				i--, j--;
			else
				break;
		}

		if (j < 0)
		{
			/* found a match */
			return TRUE;
			/*
			 * note: if we were going to seach for further matches
			 * on the input line, we would do this:
			 *
			 * i += Pat_length[Curpat] + 1;
			 *
			 * which shifts right by Pat_length[Curpat] + 1 places
			 */
		}
		else
		{
			j = (D1[Curpat][Inbuf[i]] >= D2[Curpat][j]) ?
				D1[Curpat][Inbuf[i]]
			:
				D2[Curpat][j];
			i += j;
			/* shift right by j places */
		}
	}

	return FALSE;
}
SHAR_EOF
echo shar: extracting "'bgrep.1'" '(2419 characters)'
if test -f 'bgrep.1'
then
	echo shar: over-writing existing file "'bgrep.1'"
fi
cat << \SHAR_EOF > 'bgrep.1'
.TH BGREP 1 "Georgia Tech"
.SH NAME
bgrep \- search a file for one or more simple strings
.SH SYNOPSIS
.B bgrep
[ options ] string-list [ files ]
.SH DESCRIPTION
.I Bgrep
(Boyer\-Moore grep) is program patterned after
.IR fgrep (1).
It uses the Boyer\-Moore string searching algorithm, which actually gets
faster as the length of the pattern to be searched for increases.
.I Bgrep
searches for plain
.I strings\^
(separated by newlines in the
.I string-list\^
argument),
not regular expressions in the style of
.IR grep .
.PP
The following
.I options
are recognized:
.TP
.B \-v
All lines but those matching are printed.
.TP
.B \-x
(Exact) only lines matched in their entirety are printed.
.TP
.B \-c
Only a count of the matching lines are printed.
This is the total count, across all the input files.
.TP
.BR \-i " or " \-y
Ignore case when trying to match a line.
Both versions of this option are accepted for compatibility with
.I grep
on different versions of Unix.
.TP
.B \-l
Only the names of files with matching lines are printed (once),
separated by newlines.
.TP
.B \-n
Each line is preceded by its relative line number within the file.
.TP
.B \-s
Silent mode:  No output is generated (except error messages).
This is useful for just checking the exit status.
.TP
.BI \-e " string"
Same as a simple
.I string
argument, but useful when the string begins with a \-.
.TP
.BI \-f " file"
The strings to be searched for are read from
.IR file .
.PP
The arguments to the
.BR \-e " and " \-f
options
.I must
be given as separate program arguments, i.e. separated by white space.
.PP
.I Bgrep
catches conflicting arguments (e.g.
.BR \-v " and " \-x )
and complains.
.SH SEE ALSO
.IR ed (1),
.IR sed (1),
.IR grep (1),
.IR sh (1)
.SH DIAGNOSTICS
Exit status 0 if any matches were found, 1 if none, 2 for syntax
errors on the command, or if any files could not be opened (even if
matches were found).
.SH BUGS
Input lines and the strings to be searched for are limited to 256 characters.
Longer input lines are truncated.
.PP
.I Bgrep\^
will not search for any more than 120 different strings.
.PP
Uses the <stdio.h> package, which slows it down some.
Nonetheless, in the usual case,
it is 10% to 20% faster than
.IR fgrep (1).
.SH AUTHORS
Roy Mongiovi (gatech!gitpyr!roy) coded the guts of the Boyer\-Moore algorithm,
while Arnold Robbins (gatech!arnold) wrote the code to do all the rest of
the work, and the man page.
SHAR_EOF
#	End of shell archive
exit 0



More information about the Mod.sources mailing list