/*

------------------------------------------------------------------------------

A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use.  The copyright notice included with the software
must be maintained in all copies produced.

THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE.  THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.

Copyright (c) 1995, 1996, John Conover, All Rights Reserved.

Comments and/or bug reports should be addressed to:

    john@johncon.com (John Conover)

------------------------------------------------------------------------------

bmhsearch.c,

    note: the rules of the data area to be searched are:

        1) the search area must start at page[0], (but can constitute
        a smaller area of the page data space,) and the search area
        must end with a ' ' character; it is a requirement of bmhsearch(),
        in bmhsearch.c, that the '\0' character is reserved as an end
        of search sentinel in the pattern-failure to observe this rule
        will result in a program that is erratic and either hangs
        forever, or perhaps does a core dump of a very involved data
        structure, that is very difficult to analyze-see also
        uppercase.c and bmhsearch.c

        2) the return value, count, must be the size of the data
        space, in page to be searched, *_NOT_* including the last ' '
        character

These rules are required by the high speed search algorithm,
bmhsearch(), to force the character compare loop to break at the end
of the data area. The trailing EOS, '\0', of the pattern and the
trailing ' ' of the data area to be searched will provide a character
mis-match, and break the loop, if the last word of the data area and
the pattern match.

The assembled data structure is as follows:

    1) the original symbols, referenced by the token, buffer, is
    parsed into a null terminated list of symbols referenced by the
    token, tokens

    2) a list of ELEMENT structures, in postfix order, referenced
    by the postfix_stack token

        a) each token has an ELEMENT structure, which contains a
        reference to the tokens registers for the token, the token
        type or precedence, a reference to the next ELEMENT structure
        in the postfix stack, a reference to an EVAL structure,
        and a reference to a BMHPATTERN structure

        b) each ELEMENT structure in the postfix list references two
        structures:

            i) a BMHPATTERN structure, which has a reference to the
            pattern to be searched, the length of the pattern to be
            searched, a reference to the Boyer-Moore-Horspool-Sunday
            jump table, the number of matches that are found in the
            search, and a reference to the next BMHPATTERN structure
            in the pattern stack (this structure does not exist if
            the corresponding token is an operator)

            ii) an EVAL structure, which has a reference to the
            function that will be called to evaluate the relevance of
            the search, the number of matches that are found in the
            search, and a reference to the next EVAL structure in the
            evaluation stack

    3) a list of the BMHPATTERN structures, referenced by the token,
    pattern_stack

    4) a list of the EVAL structures, referenced by the token,
    eval_stack

From postfix.c, tokens contains the contents of buffer, with the
tokens separated by exactly one '\0' character, and no whitespace,
ie., if the contents of buffer were:

                  +------------------
    buffer------->|sym1 sym2 sym3 ...
                  +------------------

then the contents of tokens, the postfix stack, the evaluation stack,
and the pattern stack would be:

                  +----------------------
    tokens------->|sym1\0sym2\0sym3\0 ...
                  +----------------------
                   ^     ^     ^
                   |     |     |
                   +-----+-----+-----------------------------+<--------------------------------------------+
                         |     |                             ^                                             |
                         |     |                             |                                             |
                         |     +--------------------------+<-+------------------------------------------+  |
                         |                                ^  |                                          |  |
                         |                                |  |                                          |  |
                         +-----------------------------+<-+--+---------------------------------------+  |  |
                                                       ^  |  |                                       |  |  |
                                                       |  |  |                                       |  |  |
    eval_stack-----------------------------------------+--+--+---------------------------------------+--+--+--+
                                                       |  |  |                                       |  |  |  |
                                                       |  |  |                                       |  |  |  |
    pattern_stack--------------------------------------+--+--+--+                                    |  |  |  |
                                                       |  |  |  |                                    |  |  |  |
                                                       |  |  |  |                                    |  |  |  |
   postfix_stack->typedef struct symbol_element        |  |  |  |                                    |  |  |  |
                  {                                    |  |  |  |                                    |  |  |  |
                      char *lexicon;-------------------+  |  |  |                                    |  |  |  |
                      enum token_type precedence;         |  |  |                                    |  |  |  |
               +------struct symbol_element *next;        |  |  |                                    |  |  |  |
               |      struct eval_element *eval;----------+--+--+------------------------------------+--+--+--+->typedef struct eval_element
               |      struct bmhpattern_struct *pattern;--+--+--+->typedef struct bmhpattern_struct  |  |  |  +->{
               |  } ELEMENT;                              |  |  +->{                                 |  |  |         int value;
               |                                          |  |         unsigned char *pattern;-------+  |  |         PTF function;
               +->typedef struct symbol_element           |  |         int *table;                      |  |  +------struct eval_element *next;
                  {                                       |  |         int length;                      |  |  |  } EVAL;
                      char *lexicon;----------------------+  |         int count;                       |  |  |
                      enum token_type precedence;            |  +------struct bmhpattern_struct *next;  |  |  |
               +------struct symbol_element *next;           |  |  } BMHPATTERN;                        |  |  |
               |      struct eval_element *eval;-------------+--+---------------------------------------+--+--+->typedef struct eval_element
               |      struct bmhpattern_struct *pattern;-----+--+->typedef struct bmhpattern_struct     |  |  +->{
               |  } ELEMENT;                                 |  +->{                                    |  |         int value;
               |                                             |         unsigned char *pattern;----------+  |         PTF function;
               +->typedef struct symbol_element              |         int *table;                         |  +------struct eval_element *next;
                  {                                          |         int length;                         |  |  } EVAL;
                      char *lexicon;-------------------------+         int count;                          |  |
                      enum token_type precedence;               +------struct bmhpattern_struct *next;     |  |
               +----- struct symbol_element *next;              |  } BMHPATTERN;                           |  |
               |      struct eval_element *eval;----------------+------------------------------------------+--+->typedef struct eval_element
               |      struct bmhpattern_struct *pattern;--------+->typedef struct bmhpattern_struct        |  +->{
               |  } ELEMENT;                                    +->{                                       |         int value;
               |                 .                                     unsigned char *pattern;-------------+         PTF function;
               |                 .                                     int *table;                            +------struct eval_element *next;
               |                 .                                     int length;                            |  } EVAL;
               |                 .                                     int count;                             |
               |                 .                              +------struct bmhpattern_struct *next;        |
               |                 .                              |  } BMHPATTERN;                              |
               |                 .                              |         .                                   |
               .                 .                              .         .                                   .
               .                 .                              .         .                                   .
               .                 .                              .         .                                   .

where the precedence element, in each ELEMENT structure, is set to the
appropriate value of the referenced symbol-the order of the
postfix_stack elements is in forward postfix order, eg., the first
ELEMENT structure should be evaluated first, the second next, and so
on; the BMHPATTERN element, table, references the
Boyer-Moore-Horspool-Sunday jump table, and the element, length, is
the length of the search pattern; the EVAL element, function, is a
reference to the corresponding function that will be used to evaluate
the symbol's value of relevance

The pattern stack is used by the function bmhsearch_list (), and each
element in the pattern stack, referenced by the token pattern_stack,
will be used, sequentially, as a search element in bmhsearch ()

The evaluation stack is used by the function postfix_eval (), and each
element in the evaluation stack, referenced by the token eval_stack,
will be used, sequentially, to determine the relevance of the search

The Boyer-Moore-Horspool-Sunday jump table is compiled for each
pattern in the pattern stack by bmhcompile_postfix (), which calls
bmhcompile () for each element in the stack

The search is performed, for each element in the pattern stack, by
bmhsearch_list (), which calls bmhsearch () for each element in the
stack

For a detailed description of the Boyer-Moore-Horspool-Sunday search
algorithm, see "Information Retrieval: Data Structures & Algorithms,"
William B. Frakes, Ricardo Baeza-Yates, Editors, Prentice Hall,
Englewood Cliffs, New Jersey, 1992, ISBN 0-13-463837-9, pp 227.

To test this module, compile the module source with -DTEST_BMHSEARCH

$Revision: 1.1 $
$Date: 1996/09/13 13:47:23 $
$Id: bmhsearch.c,v 1.1 1996/09/13 13:47:23 john Exp $
$Log: bmhsearch.c,v $
Revision 1.1  1996/09/13 13:47:23  john
Added handling of circularly linked directories and subdirectories in searchpath.c
Cosmetic changes to bmhsearch.c, postfix.c, rel.c, searchfile.c, translit.c, uppercase.c, version.c.

 * Revision 1.0  1995/04/22  05:13:18  john
 * Initial revision
 *

*/

#include "rel.h"

/*

RCS identification strings for this module and for its include file;
they are compiled in only when LINT is not defined (BMHSEARCH_H_ID is
not defined in this file-presumably it comes from a header pulled in
through rel.h)

*/

#ifndef LINT /* include rcsid only if not running lint */

static char rcsid[] = "$Id: bmhsearch.c,v 1.1 1996/09/13 13:47:23 john Exp $"; /* module version */
static char rcsid_h[] = BMHSEARCH_H_ID; /* module include version */

#endif

/*

forward declarations of the two file-local (static) helpers that are
defined further down in this file; full prototypes are given when the
compiler defines __STDC__, old style declarations, without parameter
lists, otherwise

*/

#ifdef __STDC__

static BMHPATTERN *bmhcompile (unsigned char *pattern);
static int bmhsearch (unsigned char *page, int count, unsigned char *pattern, int size, int *table);

#else

static BMHPATTERN *bmhcompile ();
static int bmhsearch ();

#endif

/*

BMHPATTERN *bmhcompile_postfix (ELEMENT *postfix_list);

    allocate and compile the jump tables for each element in the
    postfix stack

The algorithm is as follows:

    for each element in the postfix stack

      if the element token is an identifier

      allocate and compile the jump table

Usage is a call with a reference to the postfix stack, for example:

    if ((pattern_stack = bmhcompile_postfix (postfix_stack)) == (BMHPATTERN *) 0)
    {
        bmh_error ();
    }

The single argument, postfix_list, is a reference to the postfix stack

Returns a reference to the pattern stack if successful, null if not

*/

#ifdef __STDC__

BMHPATTERN *bmhcompile_postfix (ELEMENT *postfix_list)

#else

BMHPATTERN *bmhcompile_postfix (postfix_list)
    ELEMENT *postfix_list;

#endif

{
    BMHPATTERN *pattern_list = (BMHPATTERN *) 0; /* pattern stack assembled so far, stays null until the first identifier is pushed */

    ELEMENT *node; /* element of the postfix list that is being visited */

    for (node = postfix_list; node != (ELEMENT *) 0; node = node->next) /* visit the postfix list, front to back */
    {

        if (node->precedence != IDENTIFIER) /* anything that is not an identifier gets no pattern structure */
        {
            continue; /* on to the next element */
        }

        node->pattern = bmhcompile (node->lexicon); /* allocate the pattern structure and compile its jump table; the result is stored in the element, even if it is null */

        if (node->pattern == (BMHPATTERN *) 0) /* couldn't allocate and compile the jump table? */
        {
            return ((BMHPATTERN *) 0); /* yes, give up-the stack assembled so far is not returned */
        }

        PUSH (pattern_list, node->pattern); /* PUSH is defined outside of this file; presumably it links the new structure in as the head of pattern_list */
    }

    return (pattern_list); /* the pattern stack; still null if the postfix list held no identifiers */
}

/*

void bmhsearch_list (unsigned char *page, int count, BMHPATTERN *list)

    search the data area, referenced by page, which is of size, count,
    for the patterns in the pattern list, referenced by list

The algorithm is as follows:

    for each element in the pattern stack

        search the data area for patterns that match the element

Usage is a call with a reference to the data area, the size of the
data area, and a reference to the pattern list, for example:

    bmhsearch_list (page, count, list);

The argument page is the data area to be searched, which is of size,
count, and list is a reference to the pattern list

Returns nothing

*/

#ifdef __STDC__

void bmhsearch_list (unsigned char *page, int count, BMHPATTERN *bmhlist)

#else

void bmhsearch_list (page, count, bmhlist)
    unsigned char *page;
    int count;
    BMHPATTERN *bmhlist;

#endif

{
    BMHPATTERN *entry; /* pattern structure that is currently being searched for */

    for (entry = bmhlist; entry != (BMHPATTERN *) 0; entry = entry->next) /* one pass over the data area for each pattern in the list */
    {
        entry->count = bmhsearch (page, count, entry->pattern, entry->length, entry->table); /* the number of matches is left in the pattern's own structure, replacing any earlier count */
    }

}
/*

static BMHPATTERN *bmhcompile (unsigned char *pattern);

    allocate the BMHPATTERN structure, and allocate and compile the
    Boyer-Moore-Horspool-Sunday jump table for the pattern

The algorithm is as follows:

    allocate the BMHPATTERN structure

    allocate the Boyer-Moore-Horspool-Sunday jump table for a size of
    MAX_ALPHABET_SIZE

    for each element of the jump table

        the jump value is the next character after the length of the
        search pattern

    for each character in the pattern

        the jump value of that character is the length of the pattern
        - the character position

Usage is a call with a reference to the search pattern, for example:

    if ((pattern_ref = bmhcompile (pattern)) == (BMHPATTERN *) 0)
    {
        pattern_error ();
    }

The argument pattern is a reference to the pattern to be searched for

Returns a reference to the BMHPATTERN structure if successful, null
if not

*/

#ifdef __STDC__

static BMHPATTERN *bmhcompile (unsigned char *pattern)

#else

static BMHPATTERN *bmhcompile (pattern)
    unsigned char *pattern;

#endif

{
    int error_flag = URMEM_ERR, /* assume error allocating memory */
        m, /* length of pattern */
        k, /* "character" counter */
        *d; /* reference to jump table */

    BMHPATTERN *retval; /* return value */

    if ((retval = (BMHPATTERN *) memalloc (sizeof (BMHPATTERN))) != (BMHPATTERN *) 0) /* allocate the BMHPATTERN structure */
    {
        retval->table = (int *) 0; /* initialize the table refererence to null */
        retval->pattern = pattern; /* reference the search pattern */
        m = retval->length = (int) strlen ((char *) pattern); /* get the length of the pattern */
        retval->count = 0; /* assume no matches, yet */

        if ((d = retval->table = (int *) memalloc (MAX_ALPHABET_SIZE * sizeof (int))) != (int *) 0) /* allocate the jump table */
        {
            error_flag = NO_ERROR; /* assume no error */

            for (k = 0; k < MAX_ALPHABET_SIZE; k++) /* for each element of the jump table */
            {
                d[k] = m + 1; /* the jump value is the next character after the length of the search pattern */
            }

            for (k = 0; k < m; k++) /* for each character in the pattern */
            {
                d[(int) pattern[k]] = m - k; /* the jump value of that character is the length of the pattern - the character position */
            }

        }

    }

    if (error_flag != NO_ERROR) /* pending error? */
    {
        message (error_flag, (char *) 0); /* yes, print the error */
        retval = (BMHPATTERN *) 0; /* set the return value to error */
    }

    return (retval); /* return a reference to the structure, null if error */
}

/*

static int bmhsearch (unsigned char *page, int count, unsigned char *pattern, int size, int *table)

    count the occurrences of pattern, which is of length, size, in the
    data area, page, which is of length, count, using the jump table,
    table

The algorithm is as follows:

    starting at the beginning of the data area, and for as long as the
    pattern still fits in the data area

        compare the pattern with the data area at that position, one
        character at a time, stopping at the first mis-match, or when
        size characters have matched

        if size characters matched

            increment the count of matches

        move forward by the value in the jump table for the character
        that immediately follows the compared characters

Usage is a call with a reference to the data area, its length, the
search pattern, its length, and the jump table, for example:

    number = bmhsearch (page, count, pattern, length, table);

The argument, page, is the data area to be searched, and is of length,
count, the argument, pattern, is a reference to the pattern to be
searched for, and is of length, size, and the jump table is referenced
by, table, which was compiled by bmhcompile (), and must have an
element for every value that an unsigned char can hold

Note: the character immediately following the data area, page[count],
is read when the jump is looked up at the last search position, so the
data area must be followed by at least one readable character. By
convention, the calling routine arranges for this character to be a
' ', and does not include it in the length, count.

Note: the character compare loop is bounded by the length of the
pattern, size. It does not depend on the trailing EOS, '\0', of the
pattern differing from the character that follows a match, so a '\0'
character in the data area can not make the loop run past the end of
the pattern.

Returns the number of matches found, zero if the pattern is longer
than the data area

*/

#ifdef __STDC__

static int bmhsearch (unsigned char *page, int count, unsigned char *pattern, int size, int *table)

#else

static int bmhsearch (page, count, pattern, size, table)
    unsigned char *page;
    int count;
    unsigned char *pattern;
    int size;
    int *table;

#endif

{
    int j, /* number of pattern characters that matched at the current position */
        k, /* position in the data area where the comparison starts */
        lim, /* number of positions at which the pattern still fits in the data area */
        retval = 0; /* return value = count of matches, assume no matches */

    lim = count - size + 1; /* zero or negative if the pattern is longer than the data area, in which case nothing is searched */

    for (k = 0; k < lim; k += table[(int) page[k + size]]) /* move forward by the jump value of the character just past the compared characters; at the last position this reads page[count] */
    {

        for (j = 0; j < size && page[k + j] == pattern[j]; j++) /* count the leading characters that match, never looking past the end of the pattern */
        {
            ; /* all of the work is done in the loop control */
        }

        if (j == size) /* did the whole pattern match? */
        {
            retval++; /* yes, increment the count of matches */
        }

    }

    return (retval); /* return the count of matches */
}

#ifdef TEST_BMHSEARCH

/*

simple exerciser for testing bmhcompile () and bmhsearch (); the first
argument is the pattern to be searched for, the second argument is the
data area to be searched; the number of matches is printed on stdout,
and errors are printed on stderr

NOTE: A TRAILING SPACE MUST BE ADDED TO THE DATA FOR THE TEST TO WORK
PROPERLY; the last character of the data is not counted in the size of
the data area that is handed to bmhsearch ()

*/

#ifdef __STDC__

int main (int argc, char *argv[])

#else

int main (argc, argv)
    int argc;
    char *argv[];

#endif

{
    int count, /* size of the data area, not counting the trailing space */
        number; /* number of matches */

    BMHPATTERN *pattern; /* the pattern structure */

    if (argc != 3) /* enough arguments? */
    {
        (void) fprintf (stderr, "usage: %s pattern data\n", argv[0]); /* no, print the error */
        exit (1); /* and exit */
    }

    if ((pattern = bmhcompile ((unsigned char *) argv[1])) == (BMHPATTERN *) 0) /* allocate and compile the pattern structure */
    {
        (void) fprintf (stderr, "error allocating the pattern structure\n"); /* couldn't allocate the pattern structure */
        exit (1); /* exit */
    }

    /*

    the size of the data area, not counting the trailing space; the
    subtraction is done on an int, so that empty data gives a size of
    -1, for which bmhsearch () searches nothing, instead of wrapping
    around as an unsigned size

    */

    count = (int) strlen (argv[2]) - 1;

    number = bmhsearch ((unsigned char *) argv[2], count, pattern->pattern, pattern->length, pattern->table); /* search for the pattern */
    (void) printf ("%d\n", number); /* print the number of matches */
    exit (0); /* return success */

#ifdef LINT /* include only if running lint */

    return (0); /* for LINT formality */

#endif

}

#endif
