#!/local/perl586/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;

our ( $opt_override, $opt_tag );
GetOptions(
    "tag=s" => \$opt_tag,
    "override=s" => \$opt_override,
);

$opt_override ||= '';
$opt_tag ||= 'n';       # nightly is the default

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd qw(abs_path);
use POSIX qw(nice strftime);

use constant WEEK => 7*60*60*24;
nice(15);

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

# what's the max age of mail we will accept data from? (in weeks)
# TODO: maybe this should be in ~/.corpus
my $OLDEST_HAM_WEEKS    = 72 * 4;       # 72 months = 6 years
my $OLDEST_SPAM_WEEKS    = 2 * 4;       # 2 months

# ---------------------------------------------------------------------------

sub runcmd;
my $configuration = "$ENV{HOME}/.corpus";
my %cf;
my %revision = ();
my %filesize = ();
my %dateline = ();
my %mtime = ();
my %logs_by_daterev = ();
my %is_net_daterev = ();
my %time = ();
my @tmps = ();
my $time_start = time;
my $output_revpath;
my $perl_path = $^X;

configure();
init();

my $logsdir = "$cf{html}/logs";
print "reading logs from '$logsdir'\n";

locate_input();
generate_logs();
clean_up();
exit;

# ---------------------------------------------------------------------------

sub configure {
  # does rough equivalent of source
  open(C, $configuration) || die "open failed: $configuration: $!\n";
  my $pwd = Cwd::getcwd;

  # add 'override' options
  my @lines = (<C>, split(/\|/, $opt_override));

  foreach (@lines) {
    chomp;
    s/#.*//;
    if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
      my ($key, $val) = ($1, $2);
      $val =~ s/\$PWD/$pwd/gs;
      $cf{$key} = $val;
    }
  }
  close(C);

  $cf{output_classes} ||=
            "DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age";
}

# ---------------------------------------------------------------------------

sub clean_up {
  chdir "/";
  runcmd "rm -rf $cf{tmp}/*.$$ ".join(' ', @tmps);
}

# ---------------------------------------------------------------------------

sub init {
  $SIG{INT} = \&clean_up;
  $SIG{TERM} = \&clean_up;

  $ENV{RSYNC_PASSWORD} = $cf{password};
  $ENV{TIME} = '%e,%U,%S';
  $ENV{TZ} = 'UTC';
}

# ---------------------------------------------------------------------------

sub locate_input {
  opendir(CORPUS, $logsdir);
  my @files = sort readdir(CORPUS);
  closedir(CORPUS);

  @files = grep {
    /^(?:spam|ham)-(?:net-)?\S+\.log$/ && -f "$logsdir/$_" && -M _ < 10 
  } @files;

  foreach my $file (@files) {
    my $tag = 0;
    my $headers = '';

    open(FILE, "$logsdir/$file") or warn "cannot read $logsdir/$file";
    while (my $line = <FILE>) {
      last if $line !~ /^#/;
      $headers .= $line;
      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1,$2,$3);
        
        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                    GMT => 1, PREFER_PAST => 1);

        $time{$file} = $timet;
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.  TODO: parse it instead
        $dateline{$file} = $1;
        if (!defined $time{$file}) {
          # if time line unparseable (localized?) use this instead
          my ($yyyy, $mm, $dd, $h, $m, $s) = $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
          
          my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                    GMT => 1, PREFER_PAST => 1);

          $time{$file} = $timet;
        }
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $revision{$file} = $1;
      }
    }
    close(FILE);

    my @s = stat("$logsdir/$file");
    $filesize{$file} = $s[7];
    $mtime{$file} = $s[9];

    if (!defined $time{$file}) {
      warn "$logsdir/$file: no time found, ignored\n"; next;
    }
    if (!defined $revision{$file}) {
      warn "$logsdir/$file: no revision found, ignored\n"; next;
    }
    if ($revision{$file} eq 'unknown') {
      warn "$logsdir/$file: not tagged with a revision, ignored\n"; next;
    }

    my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);

    $logs_by_daterev{$daterev} ||= [ ];
    push (@{$logs_by_daterev{$daterev}}, $file);

    if ($file =~ /-net-/) {
      $is_net_daterev{$daterev} = 1;
      print "$logsdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
    }
    else {
      print "$logsdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
    }

    get_rulemetadata_for_revision($daterev, $revision{$file});
  }
}

# ---------------------------------------------------------------------------

sub sort_all {
  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  $a1 =~ s/^[\+\-]//;
  $b1 =~ s/^[\+\-]//;

  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
  if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
  elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
  elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
  if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
  elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
  elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
  return $n;
}

# ---------------------------------------------------------------------------

sub time_filter_fileset {
  my ($fileary, $outname, $after, $before) = @_;

  my $timet_before = (defined $before ? 
            ($time_start - ($before * WEEK)) : $time_start+1);
  my $timet_after  = (defined $after ? 
            ($time_start - ($after * WEEK))  : 0);

  open(TMP, ">$outname") or warn "cannot write $outname";
  for my $file (@{$fileary}) {
    open(IN, $file) or warn "cannot read $file";
    while (<IN>) {
      next unless /\btime=(\d+)/;
      next if ($1 < $timet_after || $1 > $timet_before);
      print TMP;
    }
    close IN;
  }
  close TMP or warn "failed to close $outname";
}

# ---------------------------------------------------------------------------

sub generate_logs {
  foreach my $entry (split(' ', $cf{output_classes})) {
    $entry =~ /^(\S+)\.(\S+)$/;
    my $class = $1;
    my $rtype = $2;
    if (!$rtype) { warn "no rtype in $entry"; next; }
    if ($class eq 'HTML') { warn "class HTML in $entry obsolete, ignored"; next; }

    foreach my $daterev (reverse sort keys %logs_by_daterev) {
      my $rev;
      if ($daterev !~ /\/r(\d+)/) {
        warn "bad daterev: $daterev"; next;
      }
      $rev = $1;

      if ($class eq "NET") {
        next unless $is_net_daterev{$daterev};
      }

      gen_class ($daterev, $rev, $class, $rtype);
    }
  }
} 

# ---------------------------------------------------------------------------

my ($tmp_h, $tmp_s, $no_messages_in_freqs, $hf_flags);

sub gen_class {
  my ($daterev, $rev, $class, $rtype) = @_;
  return if ($class eq "NET" && $rtype !~ /^(?:new|all|age|7day)$/);

  chdir $logsdir;
  print STDERR "\ngenerating: $cf{html}/$daterev/$class.$rtype\n";

  my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
  print STDERR "input h: " . join(' ', @ham) . "\n";

  my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};
  print STDERR "input s: " . join(' ', @spam) . "\n";

  # net vs. local
  if ($class eq "NET") {
    @ham = grep { /-net-/ } @ham;
    @spam = grep { /-net-/ } @spam;
  }
  # age
  if ($rtype =~ /(\d+)day/) {
    my $mtime = $1;
    @ham = grep { -M $_ < $mtime } @ham;
    @spam = grep { -M $_ < $mtime } @spam;
  }

  print STDERR "selected h: " . join(' ', @ham) . "\n";
  print STDERR "selected s: " . join(' ', @spam) . "\n";
  
  # we cannot continue if we have no files that match the criteria...
  # demand at least 1 ham and 1 spam file
  if (scalar @spam <= 0 || scalar @ham <= 0) {
    warn "not enough files found matching criteria ($daterev $class $rtype)\n";
    return;
  }

  my $dir = create_outputdir($daterev);
  my $fname = "$dir/$class.$rtype";

  # now, if the target file already exists, check to see if it's newer
  # than all the sources, make-style; if not, don't re-create it
  if (-f $fname) {
    my $targetfreshness = (-M $fname);
    my $needsrebuild = 0;

    foreach my $srcfile (@spam, @ham) {
      my $srcfreshness = (-M $srcfile);
      if ($targetfreshness > $srcfreshness) {     # src is fresher
        print "need rebuild, $fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
        $needsrebuild = 1;
        last;
      }
    }

    if (!$needsrebuild) {
      print "existing: $fname, fresher than sources\n";
      return;
    }
  }

  my $when = scalar localtime time;
  print qq{creating: $fname ($class)
  started $when...
};
  my $bytes = 0;

  my $tmpfname = "$fname.$$";
  if ($class eq 'LOGS') {
    $bytes = gen_report_logs($fname, \@ham, \@spam);
  }
  elsif ($class eq 'CORPUS') {
    push (@tmps, abs_path($tmpfname));

    my $cmd = "$perl_path $cf{tree}/masses/logs-to-corpus-report ".
          join(" ", @ham)." ".join(" ", @spam)." > $tmpfname";
    runcmd $cmd;
    ($? >> 8 == 0) or warn "failed to run logs-to-corpus-report";

    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
    $bytes = (-s $fname);
  }
  else {
    push (@tmps, abs_path($tmpfname));

    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
    print OUT "# ham results used for $daterev $class $rtype: " . join(" ", @ham) . "\n";
    print OUT "# spam results used for $daterev $class $rtype: " . join(" ", @spam) . "\n";
    print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";

    $hf_flags = "";
    $hf_flags = "-t net -s 1" if $class eq "NET";
    $hf_flags = "-o" if $class eq "OVERLAP";
    $hf_flags = "-S" if $class eq "SCOREMAP";

    if ($cf{rules_dir}) {
      $hf_flags .= " -c '$cf{rules_dir}'";
    }

    # are we analyzing --net mass-check logs?  if so, use scoreset 1
    if (join(" ", @ham) =~ /-net-/) {
      $hf_flags .= " -s 1" if $class eq "NET";
    }

    # catch an odd error condition, where hit-frequencies creates output
    # with no log lines included at all
    $no_messages_in_freqs = 0;

    $tmp_h = "$cf{tmp}/ham.log.$$";
    $tmp_s = "$cf{tmp}/spam.log.$$";

    if ($rtype eq "all") {
      gen_report_freqs_all($tmpfname, \@ham, \@spam, $rev);
    }
    elsif ($rtype eq "age") {
      gen_report_freqs_age($tmpfname, \@ham, \@spam, $rev);
    }
    elsif (@ham && @spam) {
      gen_report_freqs_basic($tmpfname, \@ham, \@spam, $rev);
    }

    $bytes = (-s OUT);
    close(OUT);

    if ($no_messages_in_freqs) {
      warn "ERROR: no data in freqs!  aborting, leaving tmp file as $tmpfname";
      return;
    }

    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";

    # compress for certain classes
    if ($class eq "OVERLAP") {
      $fname =~ s/'//gs;
      runcmd ("rm '$fname.gz'; gzip '$fname'"); 
      # takes care of keeping the original around so we don't have to
      if ($? >> 8 != 0) { warn "gzip '$fname' failed"; }
    }
  }

  $when = scalar localtime time;
  print qq{created: $bytes bytes, finished at $when
URL:

  $cf{ruleqa_url}$output_revpath

};

}

# ---------------------------------------------------------------------------

sub mk_daterev {
  my ($timet, $rev, $tag) = @_;
  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}

# ---------------------------------------------------------------------------

sub create_outputdir {
  my ($revpath) = @_;
  my $dir = $cf{html} .'/'. $revpath;

  # print "output dir: $dir\n";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }

  $output_revpath = $revpath;       # set the global
  $output_revpath =~ s/\//-/;       # looks nicer

  return $dir;
}

# ---------------------------------------------------------------------------

sub log_metadata_xml {
  my ($daterev, @files) = @_;
  my $str = '';

  # this is extracted into the info.xml file later by the gen_info_xml script
  foreach my $f (@files) {
    $str .= qq{
      <mclogmd file='$f'>
        <daterev>$daterev</daterev>
        <rev>$revision{$f}</rev>
        <fsize>$filesize{$f}</fsize>
        <mcstartdate>$dateline{$f}</mcstartdate>
        <mtime>$mtime{$f}</mtime>
      </mclogmd>
    };
  }

  $str =~ s/\s+/ /gs;  # on a single line please
  return '<mclogmds>'.$str.'</mclogmds>';
}

# ---------------------------------------------------------------------------

sub create_rulemetadata_dir {
  my $rev = shift;
  my $dir = "$cf{html}/rulemetadata/$rev";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }
  return $dir;
}

# ---------------------------------------------------------------------------

sub get_rulemetadata_for_revision {
  my ($daterev, $rev) = @_;

  my $dir = create_rulemetadata_dir($rev);

  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
  # "tree" path, so we have to ensure we're in the 'masses' dir
  # for this to work!
  chdir "$cf{tree}/masses" or die "cannot chdir $cf{tree}/masses";

  my $cmd = "$cf{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
                    "--rev=$rev --outputdir='$dir'";

  runcmd($cmd);
  if ($? >> 8 != 0) {
    warn "'$cmd' failed";
  }

  chdir $logsdir;
}

# ---------------------------------------------------------------------------

sub start_hit_frequencies_at_rev {
  my ($rev, $args) = @_;

  $rev ||= 'HEAD';
  (-d "$cf{tmp}/hfdir") or runcmd("mkdir -p $cf{tmp}/hfdir");
  my $hfdir = "$cf{tmp}/hfdir/r$rev";
  my $expected_svn_file = "$hfdir/Makefile.PL";

  print "setting up hit-frequencies for r$rev in $hfdir\n";

  my $needs_checkout = 0;
  if (-d $hfdir && chdir $hfdir) {
    eval {
      # "svn up" has been observed to wedge on the ruleqa zone VM, put a timeout so we can recover
      local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
      alarm 60*60;	# an hour should be generous enough
      runcmd("svn up -r$rev");
      alarm 0;
    };

    if ($@ || $?>>8 != 0 || !-f $expected_svn_file) {
      print "simple 'svn update' failed. performing full checkout instead...\n";
      $needs_checkout = 1;
    }
  } else {
    $needs_checkout = 1;
  }

  if ($needs_checkout) {
    my $svnurl = get_svn_url();
    runcmd("rm -rf $hfdir");
    eval {
      local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
      alarm 60*60;	# an hour should be generous enough
      runcmd("svn co $svnurl\@$rev $hfdir");
      alarm 0;
    };
    if ($@ || $?>>8 != 0 || !-f $expected_svn_file) { die "svn co failed"; }
  }
  chdir "$hfdir" or die "cannot chdir $hfdir";

  # ensure these are rebuilt
  runcmd "rm -f rules/70_sandbox.cf rules/72_active.cf";

  # do this twice in case Makefile.PL is rebuilt
  runcmd "( make build_rules || $perl_path Makefile.PL;make build_rules )</dev/null";

  chdir "$hfdir/masses" or die "cannot chdir $hfdir/masses";
  open (FREQS, "$perl_path hit-frequencies -TxpagP $args |")
            or die "cannot run ./hit-frequencies $args |";

  chdir $logsdir;
}

# ---------------------------------------------------------------------------
  
sub get_svn_url {
  open (SVNINFO, "svn info $cf{tree}|") or die "cannot run svn info";
  my $svnurl;
  while (<SVNINFO>) {
    /URL: (.*)$/ and $svnurl = $1;
  }
  close SVNINFO or die "cannot close svn info";
  return $svnurl;
}

# ---------------------------------------------------------------------------

sub gen_report_logs {
  my ($fname, $hamref, $spamref) = @_;

  my $bytes = 0;
  foreach my $f (@$hamref, @$spamref) {
    $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
    my $zf = "$fname-$f.gz";

    runcmd("gzip -c < $f > $zf.$$");
    if ($? >> 8 != 0) {
      warn "gzip -c < $f > $zf.$$ failed";
    }

    rename("$zf.$$", $zf) or
                  warn "cannot rename $zf.$$ to $zf";
    $bytes += (-s $zf);
  }

  # this is just so we won't recompress these logs if re-run
  open TOUCH, ">$fname" or warn "cannot write to $fname";
  close TOUCH;

  return $bytes;
}

# ---------------------------------------------------------------------------

sub gen_report_freqs_all {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;
  my %spam;
  my %ham;
  my @output;
  
  for my $file (@$spamref) {
    my $u = extract_username_from_log_filename($file);
    $spam{$u} = $file; print "username in spam log: $u\n";
  }
  for my $file (@$hamref) {
    my $u = extract_username_from_log_filename($file);
    $ham{$u} = $file; print "username in ham log: $u\n";
  }

  if (scalar keys %spam <= 0 && scalar keys %ham <= 0) {
    warn "no files found";
    return;
  }

  my $tmp_h_all = "$cf{tmp}/hamall.log.$$";
  my $tmp_s_all = "$cf{tmp}/spamall.log.$$";
  unlink $tmp_h_all, $tmp_s_all;

  my %alluserkeys;
  for my $k (keys %spam, keys %ham) {
    next if exists $alluserkeys{$k}; undef $alluserkeys{$k};
  }
  for my $user (sort keys %alluserkeys) {
    my $files_h = [];
    if ($ham{$user}) { $files_h = [ "$logsdir/$ham{$user}" ]; }
    my $files_s = [];
    if ($spam{$user}) { $files_s = [ "$logsdir/$spam{$user}" ]; }

    time_filter_fileset($files_h, $tmp_h, $OLDEST_HAM_WEEKS,  undef);
    time_filter_fileset($files_s, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

    start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
    while(<FREQS>) {
      chomp; push @output, "$_:$user\n";
    }
    close(FREQS);

    runcmd("cat $tmp_h >> $tmp_h_all");
    runcmd("cat $tmp_s >> $tmp_s_all");
  }

  if (-z $tmp_h_all && -z $tmp_s_all) {
    warn "time_filter_fileset() returned empty logs. not creating freqs!";
    return;     # we'll try again later
  }

  start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s_all $tmp_h_all");
  while(<FREQS>) {
    /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
    push @output, $_;
  }
  close(FREQS);
  for (sort sort_all @output) { print OUT; }
}

# ---------------------------------------------------------------------------

sub extract_username_from_log_filename {
  my $u = shift;
  # spam-someuser.log, spam-net-bb-jm.20090518-r775863-n.log
  $u =~ s/\.log$//; $u =~ s/.*\///; $u =~ s/^(h|sp)am-(?:net-)?//;
  $u =~ s/\.\d{8}-r\d+-[a-z]//;       # daterev
  return $u;
}

# ---------------------------------------------------------------------------

sub gen_report_freqs_age {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;
  my @output;

  for my $which (("0-1", "1-2", "2-3", "3-6")) {
    my ($before, $after) = split(/-/, $which);
    time_filter_fileset($hamref, $tmp_h, $after, $before);
    time_filter_fileset($spamref, $tmp_s, $after, $before);

    # print out by age
    start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
    while(<FREQS>) {
      chomp; push @output, "$_:$which\n";
    }
    close(FREQS);
  }
  for (sort sort_all @output) { print OUT; }
}

# ---------------------------------------------------------------------------

sub gen_report_freqs_basic {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;

  time_filter_fileset($hamref, $tmp_h, $OLDEST_HAM_WEEKS, undef);
  time_filter_fileset($spamref, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

  if (-z $tmp_h && -z $tmp_s) {
    warn "time_filter_fileset() returned empty logs. not creating freqs!";
    return;     # we'll try again later
  }

  start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
  while(<FREQS>) {
    /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
    print(OUT);
  }
  close(FREQS);
}

sub runcmd {
  my ($cmd) = @_;
  print "[$cmd]\n";
  system $cmd;
}
