#!/usr/bin/perl -w

#################################################################################
# 
# Web Secretary Ver 1.31
#
# Retrieves a list of web pages and send the pages via email to
# a designated recipient. It can optionally compare the page with a
# previously retrieved page, highlight the differences and send the
# modified page to the recipient instead.
#
# Copyright (C) 1998  Chew Wei Yih
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#################################################################################

use LWP::UserAgent;

# Print introduction banner.
print "Web Secretary Ver 1.31\n";
print "By Chew Wei Yih Copyleft (c) 1998\n\n";

# Get today's date in the format we want, e.g. "01 January 1998 (Thu)".
# Shells out to date(1); chop strips the trailing newline from the backticks.
$today = `date '+%d %B %Y (%a)'`; chop $today;

# Prepare pathnames: $base becomes the directory portion of this script's
# own path (everything up to and including the last '/'), so helper files
# and the archive directory are looked up next to the script.
($base = $0) =~ s:[^/]+$::;
$archive = "archive/";
$outgoing = $base . "index.html";        # webdiff output / page to be mailed
$page_current = $base . "retrieve.html"; # freshly downloaded copy

# Set default values for every recognized per-site keyword. A block with
# no URL is treated as a settings block that overwrites these defaults
# (see HandleSite).
%defaults =
(
    URL       => "",
    Auth      => "none",
    Name      => "",
    Prefix    => "",
    Diff      => "webdiff",
    Hicolor   => "blue",
    Ignore    => "none",
    IgnoreURL => "none",
    Email     => "",
    Proxy     => "",
    ProxyAuth => "none",
    Tmin      => 0,
    Tmax      => 99999,
);
%siteinfo = %defaults;

# Default return code
$rc = 0;

# Loop through the input file(s) named on the command line (or STDIN) and
# process all sites listed. Site blocks are separated by blank lines.
while(<>)
{
    # NOTE(review): chop removes the last character unconditionally; a final
    # line lacking a trailing newline would lose a real character (chomp
    # would be safer).
    chop $_;
    s/^\s*//;

    # Ignore comment lines
    if (m/^#.*?$/) { next; }
    
    # Handle non-empty lines: accumulate "Keyword = value" pairs into
    # %siteinfo; abort the whole run on an unrecognized keyword.
    if (length != 0)
    {
        $rc = HandleInput();
        if ($rc != 0) { last; }
        next;
    }

    # Blank line => end of a site block: process it, then reset to defaults
    $rc = HandleSite();
    if ($rc != 0) { last; }
    %siteinfo = %defaults;
}

# Process last site if available (input that did not end with a blank line)
if ($rc == 0 && $siteinfo{URL} ne "") { $rc = HandleSite(); }

# Delete temp files left over from the last site processed
unlink($outgoing);
unlink($page_current);

# End of main program
exit $rc;

# Handle setting of parameters.
# Parses the current config line in $_ as 'Keyword = "value"', validates
# the keyword and stores the (unquoted, trimmed) value into %siteinfo.
# Params: none (operates on $_ and the input line counter $.)
# Returns: 0 => OK, -1 => unrecognized keyword
# (Empty prototype removed: it declared zero args and is a known Perl
# anti-pattern; call sites compile before this declaration, so dropping
# it changes nothing.)
sub HandleInput
{
    # Set of keywords accepted in a site block; mirrors the keys of %defaults.
    my %valid_keyword = map { $_ => 1 } qw(
        URL Auth Name Prefix Diff Hicolor Ignore IgnoreURL
        Email Proxy ProxyAuth Tmin Tmax
    );

    # Get keyword, value pair. A line without '=' yields an undef value;
    # normalize it to "" explicitly — same stored result as before, but
    # without the "uninitialized value" warning under -w.
    my ($keyword, $value) = split(/=/, $_, 2);
    $value = "" unless defined $value;
    $keyword   =~ s/^\s*(.*?)\s*$/$1/;
    $value     =~ s/^\s*\"*(.*?)\"*\s*$/$1/;

    # Check if valid keyword (hash lookup replaces the old 13-way ne chain)
    if (!$valid_keyword{$keyword})
    {
        print qq(Unrecognized keyword in line $.: "$_".\n);
        return -1;
    }

    $siteinfo{$keyword} = $value;
    return 0;
}

# Handle downloading, highlighting and mailing of each site.
# Operates on the globals filled in by HandleInput (%siteinfo) and the
# pathname globals set at the top of the script; may update %defaults.
# Params: none
# Returns: 0 => OK, -1 => Error
sub HandleSite()
{
    # Get parameter values for this page
    $url       = $siteinfo{URL};
    $auth      = $siteinfo{Auth};
    $name      = $siteinfo{Name};
    $prefix    = $siteinfo{Prefix};
    $diff      = $siteinfo{Diff};
    $hicolor   = $siteinfo{Hicolor};
    $ignore    = $siteinfo{Ignore};
    $ignoreurl = $siteinfo{IgnoreURL};
    $email     = $siteinfo{Email};
    $proxy     = $siteinfo{Proxy};
    $proxyAuth = $siteinfo{ProxyAuth};
    $tmin      = $siteinfo{Tmin};
    $tmax      = $siteinfo{Tmax};

    # If block without URL, assume parameter setting block and update default values
    if ($url eq "")
    {
        %defaults = %siteinfo;
        return 0;
    }

    # If essential parameters are not present, abort with error
    if ($name eq "" || $prefix eq "" || $email eq "")
    {
        print "Name, prefix or email info missing from URL: $url.\n";
        return -1;
    }

    # Prepare for downloading this page.
    # $page_previous: last mailed copy; $page_archive: one generation older.
    print "Processing => $url ($name) ...\n";
    $page_previous = $base . $archive . $prefix . ".html";
    $page_archive = $base . $archive . $prefix . ".old.html";
    # Probe for an archived copy by attempting to open it.
    # NOTE(review): a plain -e file test would avoid this 2-arg open on a
    # bareword handle; close() on the never-opened handle is harmless.
    $page_previousExists = 1;
    open(FILE, $page_previous) or $page_previousExists = 0;
    close(FILE);
    $subj = "$name - $today";
    # Command line for the external webdiff helper living next to this script.
    # NOTE(review): interpolated values are passed through the shell by
    # system() below; config values containing shell metacharacters or
    # spaces would break or be unsafe — consider the list form of system().
    $webdiff =
        $base .
        "webdiff -archive $page_previous -current $page_current -out $outgoing " .
        "-hicolor $hicolor -ignore $ignore -ignoreurl $ignoreurl -tmin $tmin -tmax $tmax";

    # Download URL using LWP
    $ua = new LWP::UserAgent;
    $ua->agent("websec/1.0");   # identify ourselves in the User-Agent header
    $ua->env_proxy;             # honour http_proxy & friends from %ENV
    if ($proxy ne "") { $ua->proxy(http => $proxy); }
    $req = new HTTP::Request('GET', $url);
    # Credentials come as "user:password" strings from the config file.
    if ($auth ne "none") { $req->authorization_basic(split(/:/, $auth, 2)); }
    if ($proxyAuth ne "none") { $req->proxy_authorization_basic(split(/:/, $proxyAuth, 2)); }

    # Try up to 3 times to download URL ($resp is always set because the
    # loop body runs at least once)
    for(1..3)
    {
        $resp = $ua->request($req);
        if ($resp->is_success) { last; }
    }

    # If URL is successfully downloaded
    if ($resp->is_success)
    {
        # Save the body with an X-URL marker and a <BASE> tag so relative
        # links still resolve when the page is viewed from the mail.
        # NOTE(review): open result is unchecked; a write failure here
        # would go unnoticed.
        open (HTML_FILE, ">$page_current");
        print HTML_FILE "<!-- X-URL: ", $resp->base, " -->\n";
        print HTML_FILE "<BASE HREF= \"", $resp->base. "\">\n";
        print HTML_FILE $resp->content;
        close HTML_FILE;

        if ($diff eq "webdiff")
        {
            if ($page_previousExists == 1)
            {
                print "Highlighting differences from previous version of webpage ...\n";
                # NOTE(review): presumably webdiff exits non-zero when the
                # pages differ (non-zero => mail the highlighted page) —
                # confirm against the webdiff helper.
                $rc = system($webdiff);
                if ($rc != 0)
                {
                    print "Sending highlighted page to $email...\n";
                    MailDocument($outgoing, $subj, $email);
                }
                else
                {
                    print "No changes were detected.\n";
                }
                # Rotate generations: previous -> .old, current -> previous
                rename $page_previous, $page_archive;
                rename $page_current, $page_previous;
            }
            else
            {
                print "No previous version for this page. Storing in archive...\n";
                rename $page_current, $page_previous;
            }
        }
        else
        {
            # Diff disabled: always mail the freshly retrieved page as-is.
            MailDocument($page_current, $subj, $email);
            if ($page_previousExists) { rename $page_previous, $page_archive; }
            rename $page_current, $page_previous;
        }
    }
    # If unable to download URL: mail the failure notice instead of the page
    else
    {
        print "Unable to retrieve page today.\n";
        $errmsg = 
            "Unable to retrieve $name ($url) today.\n\n" .
            "Detailed error as follows:\n" . $resp->error_as_HTML;
        MailMessage($errmsg, $subj, $email);        
    }

    return 0;
}

# Mail a plain-text message.
# Params: message, subject, recipient
# Returns: none
sub MailMessage()
{
    my ($message, $subject, $recipient) = @_;

    # POST against a mailto: URL; LWP's mailto scheme handler performs
    # the actual delivery. ($req/$ua intentionally remain package globals,
    # matching the rest of this script.)
    $req = HTTP::Request->new(POST => "mailto:" . $recipient);
    $req->header(
        "Subject"                   => $subject,
        "Content-type"              => "text/plain; charset=us-ascii",
        "Content-Transfer-Encoding" => "7bit",
    );
    $req->content($message);

    $ua = LWP::UserAgent->new;
    $ua->request($req);
}

# Mail HTML document.
# Slurps the file and POSTs it to a mailto: URL via LWP.
# Params: filename, subject, recipient
# Returns: none
# Dies if the file cannot be opened.
# (Empty prototype removed: anti-pattern, and call sites compile before
# this declaration so it was never enforced anyway.)
sub MailDocument
{
    my ($filename, $subject, $recipient) = @_;

    # local'izing $/ enables slurp mode and restores the old value
    # automatically on scope exit — unlike the previous manual
    # save/restore, this cannot leave $/ clobbered if open() dies.
    local $/;
    # 3-arg open with a lexical handle replaces the old 2-arg bareword open.
    open(my $fh, '<', $filename) or die "Cannot open $filename: $!\n";
    my $content = <$fh>;
    close($fh);

    $req = HTTP::Request->new(POST => "mailto:" . $recipient);
    $req->header("Subject", $subject);
    $req->header("Content-type", "text/html");
    $req->header("Content-Transfer-Encoding", "7bit");
    $req->content($content);

    $ua = new LWP::UserAgent;
    $ua->request($req);
}
