#!/bin/sh
#
# Convert the data in one column of an rdbtable to soundex codes.
#

# RCS revision stamp; shown at the bottom of the help text.
RCS_ID='$Id: nsq-soundex,v 1.1 1998/05/29 20:44:47 carlos Exp $'

# Name this script was invoked as, used in the help and usage messages.
# Parameter expansion is used rather than an unquoted `basename $0`, which
# breaks as soon as the invocation path contains blanks.
my_name=${0##*/}

# Start from a known state. Without this, same-named variables inherited
# from the caller's environment would be taken for parsed arguments (a
# stale c_name makes the loop below treat the column as the table name).
c_name=
no_hdr=

# Parse the command line:
#   -h*    print the help text and exit 0
#   -n     set no_hdr=1 (header lines are left out of the output)
#   other  the first such word becomes the column name (c_name); parsing
#          stops at the second one, which is left in $1 as the table file.
# '-n' is still honoured when it follows the column name.
while [ $# -ge 1 ]; do
  case $1 in
    -h*)
      cat <<_EOH_

        NoSQL operator: ${my_name}

Usage:  ${my_name}  [options] column [rdbtable]

Options:
    -h    Print this help info.
    -n    Strip header from output.

Prints a code based on the contents of 'column' of an rdbtable.

This operator reads an rdbtable from a file and prints a new table to
STDOUT. The new table is the input table with an added leading column
(the 'soundex' column). The new column contains the soundex code of the
string in 'column'. The new column name is the same as 'column', with
an "s" prepended to it. For instance, if the requested column is 'Name',
then the soundex column name will be 'sName'. The type of the soundex
column is 'S' and its width is the length of the new column name, with
a minimum of 4 (so '5S' for 'sName').

If no rdbtable is specified on the command line, then the input table is
read from STDIN.

$RCS_ID

            ----------------------
NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.
This program comes with ABSOLUTELY NO WARRANTY; for details
refer to the GNU General Public License.

You should have received a copy of the GNU General Public License
along with this program;  if not, write to the Free Software
Foundation, Inc., 59 Temple Place Suite 330, Boston, MA 02111-1307
USA.
            ----------------------

_EOH_
      exit 0
      ;;
    -n)
      no_hdr=1
      shift
      ;;
    *)
      if [ -z "${c_name}" ]; then
        c_name=$1
        shift
      else
        # Second non-option word: leave it in $1 and stop parsing.
        break
      fi
      ;;
  esac
done

# Validate what option parsing left behind.
#
# bad_usage STATUS: print the one-line usage summary on stderr and
# terminate the script with STATUS.
bad_usage() {
  echo "Usage: ${my_name} [options] column [rdbtable]" >&2
  exit "$1"
}

# A column name is mandatory and must not look like an option (status 1).
case "${c_name}" in
  "" | -*) bad_usage 1 ;;
esac

# The optional table argument must not look like an option (status 2).
case "${1}" in
  -*) bad_usage 2 ;;
esac

# Scratch files:
#   tmp_1  verbatim copy of the input table
#   tmp_2  output of the soundex filter, read back one line per data row
# They are created with mktemp(1) rather than a guessable "name.PID" path,
# which in a shared temporary directory is open to symlink attacks.
tmp_1=
tmp_2=

# Remove whichever scratch files exist so far.
cleanup() {
  rm -f -- ${tmp_1:+"${tmp_1}"} ${tmp_2:+"${tmp_2}"}
}

# Clean up on every exit path. The signal traps only call exit, which in
# turn fires the exit trap, so an interrupted run stops here instead of
# carrying on with its scratch files already deleted.
trap cleanup 0
trap 'exit 1' HUP INT QUIT PIPE TERM

tmp_1=$(mktemp "${TMPDIR:-/tmp}/${my_name}.t1.XXXXXX") || exit 1
tmp_2=$(mktemp "${TMPDIR:-/tmp}/${my_name}.t2.XXXXXX") || exit 1

# Pass 1: keep a copy of the table in tmp_1 while the requested column is
# extracted by nsq-fcol and piped through the soundex filter into tmp_2.
# The table comes from the file named in $1, or from STDIN when no (or an
# empty) name was given.
# NOTE(review): c_name is spliced into the nsq-fcol expression unescaped;
# presumably a name containing quotes would break it -- verify against
# nsq-fcol.
if [ -n "${1:-}" ]; then
  cat -- "$1"
else
  cat
fi |
  tee "${tmp_1}" |
  nsq-fcol -n '$P["'"${c_name}"'"]' |
  soundex > "${tmp_2}"

# Pass 2: re-read the saved table and put the soundex column in front.
# Shell values are handed over with -v instead of being pasted into the
# program text, so a quote in the column name can no longer alter the awk
# code. Phases are tracked in r: 0 = before/at the column-name line,
# 1 = column-definition line, 2 = data rows.
awk -v col="${c_name}" -v no_hdr="${no_hdr:-0}" -v codes="${tmp_2}" '
BEGIN {
  FS = "\t"; OFS = FS
  # Fallback width, used when the requested column is not in the header.
  c_width = 4
}
# Table comments ahead of the header: passed through unless -n was given.
r == 0 && $0 ~ /^ *#/ {
  if (!no_hdr) print
  next
}
# Column names: size the new column to fit its name ("s" + column name),
# but never narrower than 4.
r == 0 {
  for (p = 1; p <= NF; p++) {
    # Appending "" forces a string comparison even for numeric-looking names.
    if (($p "") == (col "")) {
      c_width = length($p) + 1
      if (c_width < 4) c_width = 4
      break
    }
  }
  if (!no_hdr) print "s" col, $0
  r++
  next
}
# Column definitions.
r == 1 {
  if (!no_hdr) print c_width "S", $0
  r++
  next
}
# Data rows: prepend the next code from the soundex output. If that file
# runs short, print an empty code rather than repeating the previous one.
{
  if ((getline code < codes) <= 0) code = ""
  print code, $0
}' "${tmp_1}"

# The exit status of the script is that of the awk pass.
exit $?

