#!/bin/sh
#
# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
#
# Since Bayesish probability analysis requires training on a corpus, the
# traditional SpamAssassin 10-pass cross-validation suite can't be used.  Also,
# Bayes requires its own ten-pass testing, separately, to judge the effects of
# tweaks.  So that's what this is.
#
# Before running, you need to create a test corpus, as "cor/spam" and
# "cor/ham".  Here's how to do this:

#   cd TEST
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
#   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
#
# SADIR = top-level directory of SpamAssassin distro; this script reads it
#         from the SADIR environment variable (it runs "cd $SADIR/masses")
# TEST  = the directory where the corpus and results are to be written
# spfN   = mail folders full of spam
# hamN   = mail folders full of ham

# It will produce a directory of results called "results".  The most important
# are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
# the output of analysis of all scores and frequencies from the
# bayes-thresholds script.

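# NOTE: by default you will need *AT LEAST* 2000 messages of each type to use
# this, since bayes will not be activated without 200 messages in the db,
# and each fold is trained using 10% of the corpus -- and 2000/10 = 200.
# (The config file this script generates lowers bayes_min_ham_num and
# bayes_min_spam_num to 10.)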

###########################################################################

# Directory the script was started from: the corpus is read from
# $testdir/cor and the results are written to $testdir/results.
testdir=$(pwd)

# Number of parallel mass-check jobs.  Try the "cpucount" helper first, then
# count the "processor" entries in /proc/cpuinfo, and finally fall back to 1.
# Each probe is a separate assignment because "grep -c" prints "0" *and*
# fails when nothing matches; chaining the probes inside one command
# substitution would then produce the two-line value "0<newline>1".
numcpus=$(cpucount 2>/dev/null) ||
  numcpus=$(grep -E -c '^processor([^[:alnum:]_]|$)' /proc/cpuinfo 2>/dev/null) ||
  numcpus=1

# whatever the probes printed, only a positive integer is usable for "-j="
case $numcpus in
  '' | *[!0-9]* | 0) numcpus=1 ;;
esac

# Any command-line arguments are passed on to every learning "sa-learn" run.
# They are stored as one space-joined string and expanded unquoted later;
# with no arguments this is simply the empty string.
learnargs="$*"

# Everything below is invoked relative to the "masses" directory of the
# SpamAssassin tree (./mass-check, ../sa-learn, ../rules, ...).  Refuse to go
# on -- and in particular refuse to delete anything -- unless we really ended
# up in such a directory.
if [ -n "${SADIR:-}" ] ; then
  cd "$SADIR/masses" || exit 1
fi
if [ ! -f ./mass-check ] || [ ! -f ../sa-learn ] ; then
  echo "$0: cannot find ./mass-check and ../sa-learn;" \
       "set SADIR to the top-level SpamAssassin directory" >&2
  exit 1
fi

results=$testdir/results
tmpdir=$results/config

# start from scratch; quoted so a working directory containing whitespace
# cannot make rm delete unrelated paths
rm -rf -- "$results" "$tmpdir"

# now, just copy in the Bayes ruleset
mkdir -p "$results" "$tmpdir/rules"
cp ../rules/20_aux_tlds.cf "$tmpdir/rules"
cp ../rules/23_bayes.cf "$tmpdir/rules"
cp ../rules/50_scores.cf "$tmpdir/rules"
cp ../rules/*.pre "$tmpdir/rules"         # ensure we have plugins

# optional extra .pre/.cf files from the test directory; either glob may
# match nothing, so cp's complaints are deliberately discarded
cp "$testdir"/*.pre "$tmpdir/rules" 2>/dev/null
cp "$testdir"/*.cf "$tmpdir/rules" 2>/dev/null

# tell SpamAssassin to use this path for DBs
# TODO: for tests of these settings, read from a test-specific file
# (here-document instead of a multi-line echo: same bytes, but no dependence
# on how the shell's echo treats backslashes in $tmpdir)
cat > "$tmpdir/rules/30bayes_path.cf" <<EOF


bayes_path                $tmpdir/dbs/bayes
bayes_auto_learn          0
bayes_min_ham_num         10
bayes_min_spam_num        10
bayes_store_module Mail::SpamAssassin::BayesStore::SDBM
lock_method flock


EOF
mkdir "$tmpdir/dbs"

# Mode switches; edit here to change how the folds are run (1 = on, 0 = off).

# 1: learn the whole corpus once up front, then for each fold only "forget"
#    that fold's test messages.
# 0: learn each fold's learn set starting from the backed-up databases.
LEARN_ALL_THEN_FORGET_TEST_SET=0

# 1: for fold N, test against bucket N and learn from the other nine.
# 0: for fold N, learn from bucket N and test against the other nine.
TEST_AGAINST_10PC=0

# 1: split each fold's test mboxes into ten parts and alternate ham and spam
#    mass-check runs.
# 0: run one mass-check over all test ham, then one over all test spam.
INTERLEAVE_TESTS=0

# backup_dbs - archive the "dbs" directory as $tmpdir/learned-all.tar.
# Globals:  tmpdir (read)
# Outputs:  a progress message and tar's verbose listing
# Returns:  non-zero if $tmpdir cannot be entered or tar fails
backup_dbs () {
  echo "Backing up full learned DBs..."
  # "&&" rather than ";": if the cd fails, tar must not archive whatever
  # "dbs" happens to exist in the caller's directory
  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
}
# restore_dbs - replace $tmpdir/dbs with the contents of
# $tmpdir/learned-all.tar.
# Globals:  tmpdir (read)
# Outputs:  a progress message
# Returns:  non-zero if $tmpdir cannot be entered, or rm/tar fails
restore_dbs () {
  echo "Restoring full learned DBs..."
  # chained with "&&": a failed cd must never lead to "rm -rf dbs" being run
  # in the caller's directory
  ( cd "$tmpdir" && rm -rf dbs && tar xf learned-all.tar )
}
# runcmd - log a command line, then run it under "time".
# Arguments: the command and its arguments
# Outputs:   the command line on stderr; the command's own output untouched
# Returns:   the exit status of the command
runcmd () {
  # The trace goes to stderr so that "runcmd cmd > file" captures only the
  # command's output, not the command line itself.
  printf '%s\n' "$*" >&2
  # "$@" keeps each argument intact; an unquoted $* would split arguments on
  # whitespace again and re-expand any glob characters in them.
  if command -v time >/dev/null 2>&1 ; then
    time "$@"
  else
    # no "time" keyword or utility available: still run the command
    "$@"
  fi
}

# Optional up-front learning pass, followed by a snapshot of the databases
# that every later fold is restored from.
if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then

  # learn the lot, then forget the ones we're testing on each time.
  # faster than learning from scratch for each fold

  # note: we use randseed=1 so that every run will always pick the
  # same messages if --learnprob is used.

  # printf instead of "echo -n": what echo does with -n is
  # implementation-defined under /bin/sh.
  # $learnargs is left unquoted on purpose: it holds several options in one
  # string and has to be split into words.
  (
  printf '%s' "Learning from all ham buckets..." ; date
  runcmd ../sa-learn --ham --randseed=1 --no-sync $learnargs \
      --showdots --mbox --siteconfigpath="$tmpdir/rules" \
      --config-file="$tmpdir/rules" "$testdir"/cor/ham/*

  printf '%s' "Learning from all spam buckets..." ; date
  runcmd ../sa-learn --spam --randseed=1 --no-sync $learnargs \
      --showdots --mbox --siteconfigpath="$tmpdir/rules" \
      --config-file="$tmpdir/rules" "$testdir"/cor/spam/*

  runcmd ../sa-learn --sync $learnargs --siteconfigpath="$tmpdir/rules" \
      --config-file="$tmpdir/rules"

  printf '%s' "Done learning. " ; date
  ) 2>&1 | tee "$results/learn.log"

  echo "Dumping bayes DB..."
  ../sa-learn --dump --dbpath="$tmpdir/dbs/bayes" > "$results/bayes_db.dump"

fi

# snapshot the databases as they are now (fully trained, or untouched when
# the pass above is disabled); folds 2..10 are restored from this archive
backup_dbs

# Main cross-validation loop: ten folds, each with its own results directory
# $results/bucketN.  Everything printed inside the subshell (stdout and
# stderr) is copied to $results/test.log.
(

# printf instead of "echo -n": what echo does with -n is
# implementation-defined under /bin/sh.
printf '%s' "Starting test..." ; date
for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
  printf '%s' "Bucket $bucket..." ; date

  # the first fold still has the databases as backed up; later folds must
  # undo the previous fold's learning/forgetting
  if [ "$bucket" != 1 ] ; then restore_dbs ; fi

  rdir=$results/bucket$bucket
  mkdir "$rdir"

  # build this fold's learn and test mboxes (h = ham, s = spam)
  : > "$rdir/hbucketlearn"
  : > "$rdir/sbucketlearn"
  : > "$rdir/hbuckettest"
  : > "$rdir/sbuckettest"
  for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
    # l = learn from this corpus bucket, t = test against it
    type=l
    [ "$TEST_AGAINST_10PC" = 1 ] && [ "$subbucket" = "$bucket" ] && type=t
    [ "$TEST_AGAINST_10PC" = 0 ] && [ "$subbucket" != "$bucket" ] && type=t

    if [ "$type" = l ] ; then
      echo "Using bucket for learn: $subbucket ..."
      cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbucketlearn"
      cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbucketlearn"
    else
      echo "Using bucket for test: $subbucket ..."
      cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbuckettest"
      cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbuckettest"
    fi
  done

  if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
    echo "Forgetting contents of test ham bucket..."
    runcmd ../sa-learn --forget --siteconfigpath="$tmpdir/rules" \
        --config-file="$tmpdir/rules" --showdots \
        --mbox "$rdir/hbuckettest"

    echo "Forgetting contents of test spam bucket..."
    runcmd ../sa-learn --forget --siteconfigpath="$tmpdir/rules" \
        --config-file="$tmpdir/rules" --showdots \
        --mbox "$rdir/sbuckettest"

  else
    # $learnargs is left unquoted on purpose: it holds several options in
    # one string and has to be split into words
    echo "Learning contents of learn ham bucket..."
    runcmd ../sa-learn --ham --randseed=1 --no-sync $learnargs \
        --showdots --mbox --siteconfigpath="$tmpdir/rules" \
        --config-file="$tmpdir/rules" "$rdir/hbucketlearn"

    echo "Learning contents of learn spam bucket..."
    runcmd ../sa-learn --spam --randseed=1 --no-sync $learnargs \
        --showdots --mbox --siteconfigpath="$tmpdir/rules" \
        --config-file="$tmpdir/rules" "$rdir/sbucketlearn"

    runcmd ../sa-learn --sync $learnargs --siteconfigpath="$tmpdir/rules" \
        --config-file="$tmpdir/rules"

    echo "Dumping bayes DB..."
    ../sa-learn --dump --dbpath="$tmpdir/dbs/bayes" > "$rdir/bayes_db.dump"
  fi

  # sync in both modes (the learn branch above has already synced once)
  runcmd ../sa-learn --sync --siteconfigpath="$tmpdir/rules" \
      --config-file="$tmpdir/rules"

  # take a copy of the trained Bayes DBs
  ( cd "$tmpdir" && tar cf dbs.tar dbs )

  if [ "$INTERLEAVE_TESTS" = 1 ] ; then
    # now split the ham and spam test bucket into 10 sub-buckets,
    # so we interleave ham and spam while testing. important for
    # judging expiry effects
    : > "$rdir/nonspam.log"
    : > "$rdir/spam.log"

    mkdir "$rdir/testbuckets"
    ../tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
        "$rdir/hbuckettest"
    ../tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
        "$rdir/sbuckettest"

    for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
      echo "Running mass-check on ham test-bucket $subbucket..."
      time ./mass-check -j="$numcpus" -c="$tmpdir/rules" -p="$tmpdir/rules" \
          --showdots \
          --bayes --mbox "$rdir/testbuckets/ham.$subbucket" \
          >> "$rdir/nonspam.log"

      echo "Running mass-check on spam test-bucket $subbucket..."
      time ./mass-check -j="$numcpus" -c="$tmpdir/rules" -p="$tmpdir/rules" \
          --showdots \
          --bayes --mbox "$rdir/testbuckets/spam.$subbucket" \
          >> "$rdir/spam.log"
    done

  else
    echo "Running mass-check on ham bucket..."
    runcmd ./mass-check -j="$numcpus" -c="$tmpdir/rules" -p="$tmpdir/rules" \
        --showdots \
        --bayes --mbox "$rdir/hbuckettest" \
        > "$rdir/nonspam.log"

    echo "Running mass-check on spam bucket..."
    runcmd ./mass-check -j="$numcpus" -c="$tmpdir/rules" -p="$tmpdir/rules" \
        --showdots \
        --bayes --mbox "$rdir/sbuckettest" \
        > "$rdir/spam.log"
  fi

  echo "Reporting..."
  ./bayes-testing/draw-bayes-histogram \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/hist"

  ./bayes-testing/bayes-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds"

  ./bayes-testing/bayes-static-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds.static"

  # remove these, they're too big.
  rm -f "$rdir/hbucketlearn" "$rdir/sbucketlearn"

  # but keep these to find FPs/FNs later
  # (compressed with lz4 if that works, otherwise with gzip)
  lz4 --rm -m "$rdir/hbuckettest" "$rdir/sbuckettest" || \
  gzip "$rdir/hbuckettest" "$rdir/sbuckettest"

done
printf '%s' "Done test..." ; date

) 2>&1 | tee "$results/test.log"

# Combine the per-fold mass-check logs and rerun the three report scripts
# over the combined logs.  Paths are quoted so that a results directory
# containing whitespace is handled correctly.
cat "$results"/bucket*/spam.log > "$results/spam_all.log"
cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

./bayes-testing/draw-bayes-histogram \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/hist_all"
./bayes-testing/bayes-thresholds \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/thresholds_all"
./bayes-testing/bayes-static-thresholds \
    "$results/spam_all.log" "$results/nonspam_all.log" \
    > "$results/thresholds_all.static"

echo "Done."
ls -l "$results"

