/*
 * Jeffrey Friedl
 * Omron Corporation			オムロン株式会社
 * Nagaokakyoshi, Japan			〒617 長岡京市
 *
 * jfriedl@nff.ncl.omron.co.jp
 *
 * This work is placed under the terms of the GNU General Public License
 * (the "GNU Copyleft").
 *
 * October 1993
 *
 * See comment in index.h for general info about this stuff.
 */

#include "config.h"
#include "system.h"
#include "assert.h"
#include "system.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#ifndef O_CREATE
# if defined(_HAVE_SYS_FCNTL_H_)
#   include <sys/fcntl.h>
# elif defined(_HAVE_FCNTL_H_)
#   include <fcntl.h>
# endif
#endif

#include "output.h"
#include "index.h"
#include "xmalloc.h"	/* for xmalloc () */
#include <ctype.h>

/*
 * Running count of text lines seen by SCAN_TEXT (bumped once per line).
 * create_index() zeroes it before the first pass and reads it right after
 * that pass; it is not reset before the second pass.
 */
static unsigned line_count;

/*
 * Walk every line of the virtual file V and, for each indexable character
 * on a line, call ENTER(HI, LO, VALUE) where HI/LO is the character's
 * 7-bit byte pair and VALUE is the file offset of the start of that line.
 *
 *   - A two-byte character whose lead byte is 0245 (katakana) is entered
 *     with lead byte 0244 instead, i.e. folded to hiragana.
 *   - Any other byte with the high bit set is taken as the lead byte of a
 *     two-byte character; both bytes are entered with the high bit removed.
 *   - ASCII alphanumerics are entered with HI == 0 and LO lowercased.
 *   - All other bytes are ignored.
 *
 * The file-scope line_count is incremented once per line scanned.
 * MSG is shown in the progress line.  FLAGS is currently unused: the
 * INDEX_REPORT_PROGRESS test is disabled, so progress is always reported.
 */
static void
SCAN_TEXT(VirtFile *v,
          void (*enter)(unsigned char hi, unsigned char lo, TextOffset val),
          unsigned flags,
          const char *msg)
{
    unsigned lastpercent = 0; /* last percentage written to the progress line */
    fileloc pos = 0, end = v->length;

    while (pos < end)
    {
        /* "value" is what will eventually be stored in the per-char list */
        TextOffset value = pos;
        const unsigned char *strptr, *strend;
        unsigned linelen;
        unsigned percent = (unsigned)((100 * pos) / end);

        /* only redraw the progress line when the percentage changes */
        if (percent != lastpercent) {
            lastpercent = percent;
            outputf("[%s] %s %02d%% \r", v->filename, msg, (int)percent);
            flush_output();
        }

        strptr = VirtPos2Str(v, pos, &linelen);
        if (strptr == NULL)
            break;
        line_count++;
        strend = strptr + linelen;
        pos += linelen + 1; /* +1 for newline */

        /* for each character (single or multibyte) in the line... */
        while (strptr < strend)
        {
            unsigned char c = *strptr++;

            if (c & 0x80) {
                unsigned char lead;

                /*
                 * A lead byte with nothing after it on this line is a
                 * truncated character: do not read past the end of the
                 * line to find a second byte.
                 */
                if (strptr >= strend)
                    break;

                /* a multibyte katakana is translated to hiragana */
                lead = (c == 0245) ? 0244 : c;
                enter(lead & 0x7f, *strptr++ & 0x7f, value);
            } else if (isalnum(c)) {
                /* a regular (ASCII) alphabetic or numeric */
                enter(0, isupper(c) ? tolower(c) : c, value);
            }
        }
    }
}

/*
 * Before creating the real index, we'll use a temporary full
 * (i.e. non-sparse) HI/LO array index to compute some things,
 * such as how we'll sparse-ize the real index.
 *
 * There is one entry per HI/LO pair.  The whole table is allocated and
 * zeroed at the start of create_index() and freed at its end.
 */
struct fullindex {
    struct fullindex_entry {
	/*
	 * Pass 1 (enter1): number of distinct line values entered for this
	 * character, not incremented past MAX_COUNT.  create_index() then
	 * sets it to SKIPPED_COUNT for characters left out of the index.
	 * Pass 2 (enter2): decremented once per value written.
	 */
	unsigned count;
	/* number of values enter2 has written for this character so far */
	unsigned count2;
	/*
	 * Most recent line value entered; used to ignore repeats on the same
	 * line and to compute the delta to the next value.  Cleared by
	 * create_index() between the two passes.
	 */
	TextOffset lastentered;
	/*
	 * Bytes of packed list data: accumulated by enter1, zeroed for
	 * skipped characters, and reduced by enter2 as values are written.
	 */
	int mem_needed;
	/*
	 * Set by create_index() to this character's list area inside the
	 * real index; passed by address to write_packed_value() in enter2
	 * (presumably advanced by it -- confirm in its definition).
	 */
	unsigned char *listptr;
    } char_info[/*high 7-bit byte*/128][/*low 7-bit byte*/128];
} *fullindex;

/*
 * First-pass callback handed to SCAN_TEXT by create_index().
 *
 * Records that character HI/LO occurs on the line whose value is DATA:
 * adds the packed size of the delta from the previously entered line to
 * the character's memory requirement and bumps its line count.  A repeat
 * of the character on the same line is ignored.  The count is not
 * incremented once it equals MAX_COUNT.
 */
static void
enter1(unsigned char hi, unsigned char lo, TextOffset data)
{
    struct fullindex_entry *entry = &fullindex->char_info[hi][lo];
    unsigned delta = data;  /* first occurrence: the value itself */

    if (entry->count != 0) {
        if (entry->lastentered == data)
            return; /* already entered for this line */
        delta = data - entry->lastentered;
    }

    entry->mem_needed += bytes_required_for_packed_value(delta);
    entry->lastentered = data;

    /* saturate at MAX_COUNT rather than counting further */
    if (entry->count == MAX_COUNT)
        return;

    if (entry->count++ == (unsigned)~0)
        die("count overflow at %s line %d.\n", __FILE__, __LINE__);
}


/*
 * Second-pass callback handed to SCAN_TEXT by create_index().
 *
 * Appends, in packed form, the delta between DATA (the line value) and the
 * previously entered line value to the list for character HI/LO, using the
 * list memory that create_index() set aside in the entry's listptr.
 * Characters marked SKIPPED_COUNT are ignored, as is a repeat of a
 * character on the same line.
 */
static void
enter2(unsigned char hi, unsigned char lo, TextOffset data)
{
    struct fullindex_entry *p = &fullindex->char_info[hi][lo];
    unsigned diff;

    /* nothing is stored for characters omitted from the index */
    if (p->count == SKIPPED_COUNT)
        return;

    if (p->count2 == 0)
        diff = data;
    else if (p->lastentered == data)
        return; /* been there, done that */
    else {
        /* line values must arrive in increasing order for a delta list */
        kibishii_assert(data > p->lastentered);
        diff = data - p->lastentered;
    }

    kibishii_assert(p->count != 0);
    kibishii_assert(p->listptr != 0);

    /*
     * Debugging aid:
     *   outputf("char %c%c: output %d [%d -> %d]\n",
     *           hi, lo, diff, p->lastentered, data);
     */

    p->lastentered = data;
    /* note that we've seen this char and used its memory */
    p->count--;
    p->count2++;
    p->mem_needed -= write_packed_value(&p->listptr, diff);
    assert(p->mem_needed >= 0);
}

/*
 * create_index(V, PERCENT, FLAGS)
 *
 * Create an index for the virtual file V and return it in memory obtained
 * from xmalloc().
 *
 * Characters found on LIMITCOUNT or more lines are omitted from the index,
 * where LIMITCOUNT is PERCENT percent of the file's line count (integer
 * arithmetic), capped at MAX_COUNT.
 *
 * FLAGS: INDEX_REPORT_STATS reports the count and memory of each indexed
 * character; INDEX_REPORT_SKIPPED reports each omitted character.
 *
 * The text is scanned twice: once (enter1) to size everything, and once
 * (enter2) to fill in the lists.  The layout after the header is: the
 * per-HI list-offset arrays, then the per-HI count arrays, then the lists.
 *
 * The header and the whole allocation are zeroed before use so that slots
 * which are never assigned (unused HI rows, absent or skipped characters,
 * FileP) hold zero rather than indeterminate data.
 */
struct index *
create_index(VirtFile *v, unsigned percent, unsigned flags)
{
    unsigned hi, lo;            /* general usage for accessing index */
    struct index index;         /* real index header */
    struct index *indexp;       /* fully allocated real index pointer */
    unsigned char *freemem;     /* pointer into indexp of unpartitioned mem */

    /* start from a fully defined header; only some fields are set below */
    bzero((void*)&index, sizeof(index));

    /* allocate and clear memory for fullindex -- freed at end of this fcn */
    fullindex = (void*)xmalloc(sizeof(*fullindex));
    bzero((void*)fullindex, sizeof(*fullindex));

    /*
     * Do the first scan of the text, noting how many
     * lines each character is on, and how much memory its index will need.
     */
    line_count = 0;
    SCAN_TEXT(v, enter1, flags, "index (first pass): ");
    index.linecount = line_count;

    /* figure the line limiter */
    index.limitcount = index.linecount * percent / 100;
    if (index.limitcount > MAX_COUNT)
        index.limitcount = MAX_COUNT;

    /*
     * Will run through the fullindex we created and note, for each
     * character seen (i.e. for each hi/lo combo there) how much memory
     * in the index we'll need to represent it (if it's not omitted because
     * it's on too many lines).
     */
    index.indexsize = sizeof(index); /* will at least need index head */

    for (hi = 0x00; hi < 0x80; hi++) /* for every possible HI byte.... */
    {
        /* look for the first LO with entries in it */
        for (lo = 0; lo < 0x80; lo++)
            if (fullindex->char_info[hi][lo].count != 0)
                break;

        /* no memory needed if there are none */
        if (lo >= 0x80) {
            index.hi[hi].first_lo = index.hi[hi].end_lo = 0;
            continue;
        }

        /* now go through the rest of the LOs, noting the last we've seen */
        for (index.hi[hi].first_lo = lo; lo < 0x80; lo++)
        {
            unsigned count = fullindex->char_info[hi][lo].count;
            if (count == 0)
                continue;

            /* note that we've seen a LO at least this late in the game */
            index.hi[hi].end_lo = lo;

            if (count < index.limitcount) {
                if (flags & INDEX_REPORT_STATS)
                {
                    outputf("%d times [%c%c] %d bytes]\n", count,
                           (hi ? (hi|0x80) : ' '), (hi ? (lo|0x80) : lo),
                           fullindex->char_info[hi][lo].mem_needed);
                }
                index.indexsize += fullindex->char_info[hi][lo].mem_needed;
            } else {
                /* on too many lines... we'll omit this from the index */
                if (flags & INDEX_REPORT_SKIPPED)
                   outputf("[%c%c:%d/%d]",
                         (hi ? (hi|0x80) : ' '), (hi ? (lo|0x80) : lo),
                          count, fullindex->char_info[hi][lo].mem_needed);
                fullindex->char_info[hi][lo].count = SKIPPED_COUNT;
                fullindex->char_info[hi][lo].mem_needed = 0;
            }
        }

        index.hi[hi].end_lo++; /* now points just beyond last char*/

        /* must also account for the lo_count[] and lo[] arrays */
        index.indexsize += (index.hi[hi].end_lo - index.hi[hi].first_lo)
                     * (sizeof(elementcount) + sizeof(IndexOffset));
    }
    if (flags & INDEX_REPORT_SKIPPED)
        outchar('\n');

    indexp = xmalloc(index.indexsize);     /* allocate memory for real index */
    bzero((void*)indexp, index.indexsize); /* unassigned slots stay zero */
    *indexp = index;                       /* copy partially filled header */
    freemem = (unsigned char *)&indexp[1]; /* point to free data after head */

    /* go into the real index to create the list array holders */
    for (hi = 0x00; hi < 0x80; hi++)
    {
        if (index.hi[hi].end_lo != 0) {
            unsigned count = index.hi[hi].end_lo - index.hi[hi].first_lo;
            indexp->hi[hi].shifted_lo = makeIndexOffset(indexp, freemem);
            freemem += sizeof(IndexOffset) * count;
        }
    }

    /* go into the real index to partition the count array holders */
    for (hi = 0x00; hi < 0x80; hi++)
    {
        if (index.hi[hi].end_lo != 0) {
            unsigned count = index.hi[hi].end_lo - index.hi[hi].first_lo;
            indexp->hi[hi].listcount = makeIndexOffset(indexp, freemem);
            freemem += sizeof(elementcount) * count;
        }
    }

    /* go into the index to partition the list memories */
    for (hi = 0x00; hi < 0x80; hi++)
    {
        IndexOffset thisCountPtr;
        IndexOffset thisListPtr;

        if (index.hi[hi].end_lo == 0)
            continue; /* no lists needed here */

        thisCountPtr = indexp->hi[hi].listcount;
        thisListPtr  = indexp->hi[hi].shifted_lo;

        /* for each LO that exists for this HI.... */
        for (lo = index.hi[hi].first_lo; lo < index.hi[hi].end_lo; lo++)
        {
            /* insert count for this HI/LO  */
            elementcount count = fullindex->char_info[hi][lo].count;
            *realptr(indexp, thisCountPtr, elementcount *) = count;

            /* partition memory for the list if there's a list for this pair */
            if (count && count != SKIPPED_COUNT) {
                *realptr(indexp, thisListPtr, IndexOffset *) =
                    makeIndexOffset(indexp, freemem);
                fullindex->char_info[hi][lo].listptr = freemem;
                freemem += fullindex->char_info[hi][lo].mem_needed;
            }

            /* bump up count and listptr pointers for next LO */
            thisListPtr  += sizeof(IndexOffset);
            thisCountPtr += sizeof(elementcount);

            /* clear this for the next runthrough */
            fullindex->char_info[hi][lo].lastentered = 0;
        }
    }

    /* make sure it came out exactly right */
    kibishii_assert(makeIndexOffset(indexp, freemem) == index.indexsize);

    /*
     * Run through text a 2nd time, actually creating the real index.
     */
    SCAN_TEXT(v, enter2, flags, "index (final pass): ");

    /* wipe the progress line (the INDEX_REPORT_PROGRESS test is disabled) */
    output("                                            \r");

    free(fullindex);
    fullindex = NULL;   /* don't leave the file-scope pointer dangling */

    indexp->FileP = NULL;   /* this index lives entirely in memory */
    indexp->magic = INDEX_MAGIC;
    indexp->version_major = INDEX_VERSION_MAJOR;
    indexp->version_minor = INDEX_VERSION_MINOR;
    return indexp;
}

/*
 * Write the in-memory index I (I->indexsize bytes starting at I) to the
 * file FILENAME, creating or truncating it with mode 0444.
 *
 * Returns the negative result of open() if the file cannot be opened,
 * 1 if writing or closing fails, and 0 on success.
 *
 * write() may accept fewer bytes than requested without that being an
 * error, so writing continues from where it stopped until everything is
 * out or write() reports failure (or makes no progress).
 */
int write_index_file(const char *filename, const struct index *i)
{
    const char *cursor = (const char *)i;
    unsigned long remaining = i->indexsize;
    int iserror = 0;
    int fd;

    fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0444);
    if (fd < 0)
        return fd;

    while (remaining > 0) {
        long written = write(fd, (const void *)cursor, remaining);

        if (written <= 0) {
            iserror = 1;
            break;
        }
        cursor += written;
        remaining -= (unsigned long)written;
    }

    /* a failed close can mean the data never reached the file */
    if (close(fd) != 0)
        iserror = 1;

    return iserror;
}

/*
 * If TRY is true, issue no message if file not found.
 */
struct index *read_index_file(const char *filename, int try, unsigned flags)
{
    struct index *index = 0;
    long int size = filesize(filename);

    if (size < 0 && try)
	return 0;
    if (size >= 0) {
	int fd = open(filename, 0);
	index = xmalloc(size);
	read(fd, (void *)index, size);
	close(fd);
    }

    if (index == 0)
    {
	if (!try)
	    outputf("[open of \"%s\" failed: %n]\n", filename);
	return 0;
    }

    index->FileP = NULL;

    if (size != index->indexsize) {
	warn("<warning, index seems corrupt: size is %ld, index says %d>\n",
		size, index->indexsize);
    } else if (index->magic != INDEX_MAGIC) {
	warn("<warning, index file magic is wrong>\n");
    } else if (index->version_major != INDEX_VERSION_MAJOR) {
	warn("<warning, index version major is wrong (%d != %d)>\n",
		index->version_major, INDEX_VERSION_MAJOR);
    } else if (index->version_minor != INDEX_VERSION_MINOR) {
	warn("<warning, index version minor is wrong (%d != %d)>\n",
		index->version_minor, INDEX_VERSION_MINOR);
    }
    return index;
}

/*
 * Open the index file FILENAME and read only its header into memory
 * obtained from xmalloc().  The stream is left open and stored in the
 * header's FileP member.
 *
 * Returns NULL if the file cannot be opened; calls die() if the header
 * cannot be read.  A magic or version mismatch only produces a warning;
 * the header is still returned.
 */
struct index *mem_read_index_file(const char *filename)
{
    struct index *header;
    FILE *stream = fopen(filename, "r");

    if (stream == NULL)
        return NULL;

    header = xmalloc(sizeof(*header));
    if (fread(header, sizeof(*header), 1, stream) != 1)
        die("bad fread of index header: %n\n");

    /* keep the stream with the header: the rest of the file is not loaded */
    header->FileP = stream;

    /* report only the first mismatch found */
    if (header->magic != INDEX_MAGIC)
        warn("<warning, index file magic is wrong>\n");
    else if (header->version_major != INDEX_VERSION_MAJOR)
        warn("<warning, index version major is wrong (%d != %d)>\n",
                header->version_major, INDEX_VERSION_MAJOR);
    else if (header->version_minor != INDEX_VERSION_MINOR)
        warn("<warning, index version minor is wrong (%d != %d)>\n",
                header->version_minor, INDEX_VERSION_MINOR);

    return header;
}

/*
 * Return true if the named file seems to be an index file: it can be
 * opened and stat'ed, a full index header can be read from it, the
 * header's magic is INDEX_MAGIC, and the header's indexsize equals the
 * file's size.  Returns 0 otherwise.
 */
int is_index_file(const char *filename)
{
    int fd;
    struct index head;
    struct stat statbuf;
    int i = 0;

    fd = open(filename, 0);
    if (fd < 0)
        return 0;

    /* fstat() returns 0 on success; only then is statbuf usable below */
    if (fstat(fd, &statbuf) == 0)
        i = read(fd, &head, sizeof(head));

    close(fd);

    /* fstat failed, read failed, or the file is shorter than a header */
    if (i != (int)sizeof(head))
        return 0;

    if (head.magic != INDEX_MAGIC)
        return 0;

    if (head.indexsize != statbuf.st_size)
        return 0;

    return 1;
}
