#!/usr/bin/perl -w

#################################################################################
#
# DejaSearch Ver 1.8.4
#
# Frontend to DejaNews (http://www.dejanews.com/) to retrieve and consolidate
# all search results into one single HTML file.
#
# Copyright (C) 1998  Chew Wei Yih, Victor <vchew@post1.com>
#                     Steffen Ullrich <coyote.frank@gmx.net>
#                     Frank de Lange <frank@unternet.org>
#                     Dan Shiovitz <dans@drizzle.com>
#
# This program is free software; you can redistribute it and/or 
# modify it under the terms of the GNU General Public License   
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#################################################################################

use Socket;
use Getopt::Long;
use POSIX; # for strftime

# Check whether DejaSearch is used as CGI script or not.
if ($ENV{'REQUEST_METHOD'})
{
    require CGI;
    $CGI = 1;
}

# Always reject messages from the following newsgroups
@rejectList =
(
    "alt.2600",
);

# These browsers don't like frames...
@non_frames_browsers =
(
    "mosaic",
    "Lynx",
    "w3m",
);

# These are the search patterns to use for both the `classic' and `new'
# versions of dejanews. If the interface changes, these patterns will have
# to be changed to match.
%deja = (
    new =>
    {
        base => 'http://www.deja.com',
	    get_url => '/dnquery.xp?DBS=1&VW=',
        match_message => '(\\d{2}/\\d{2}/\\d{4}).*?<A HREF=\"(http://.{3}\\.deja\\.com/getdoc\\.xp[^>]*?hitnum=\\d+)\">(.*?)</A>.*?</TD>\s*<TD>\s*<B>(.*?)</B>\s*</TD>\s*<TD>.*?<A HREF.*?>(.*?)</A></TD>',
	    get_next_url => '<A HREF=\"(http://.{3}\\.deja\\.com/dnquery\\.xp\\?search=next[^>]*?&CONTEXT=[\\d\\.]+)\">Next</A>',
	    match_forum => 'Forum</a>:\s*<b.*?>(.*?)</b></span><br>',
        match_thread => '<A HREF=\"(http://.{3}\\.deja\\.com/viewthread\\.xp\?.*?)\">Thread</A>',
	    match_subject => '<TD>Subject:.*?</TD>\s*<TD.*?>\s*<B>(.*?)</B>\s*</TD>\s*</TR>',
	    match_date => '<TD>Date:</TD><TD>(.*?)</TD></TR>',
	    match_author => '<TD>Author:</TD>\s*<TD><B><A HREF.*?>(.*?)</B></A>.*?<A HREF=\"mailto:(.*?)\">.*?</A>.*?</TD></TR>',
	    match_content => '<TABLE.*?>\s*<TR>\s*<TD.*?>\s*<P>\s*(<P>\s*.*?)\s*</TD>\s*</TR>\s*</TABLE>',
    },
    classic =>
    {
	    base => 'http://www.deja.com',
	    get_url => '/=dnc/dnquery.xp?DBS=1&VW=',
        match_message => '(\\d{2}/\\d{2}/\\d{2}).*?<A HREF=\"(http://.{3}\\.deja\\.com/=dnc/getdoc\\.xp.*?)\">(.*?)</a>.*?<B>(.*?)</B>.*?<FONT.*?>(.*?)</FONT>',
        get_next_url => '<A HREF=\"(http://.{3}\\.deja\\.com/=dnc/dnquery\\.xp\\?search=next[^>]*?&CONTEXT=[\\d\\.]+)\">Next messages',
	    match_forum => 'Forum:</b></font></td>.*?<td.*?>.*?</td>.*?<td><font size=2>(.*?)</font>',
        match_thread => '<A HREF=\"(http://.{3}\.deja\.com/=dnc/viewthread\.xp\?.*?\").*?><B>Return</B></A>',
	    match_subject => '<FONT.*?color=\"#[^\"]+\"><B>(.*?)</B></FONT><font size=4>&nbsp;</font>',
	    match_date => 'Date:</b></FONT></td>.*?<td.*?>.*?</td>.*?<td><font size=2>(\d{4}/\d{2}/\d{2})</font></td>',
	    match_author => '<TD.*?><FONT.*?><B>Author:</B></FONT></TD>\s*<TD.*?>.*?</TD>\s*<TD.*?><FONT.*?><B>(.*?)</B>.*?<A HREF=\"mailto:(.*?)\">.*?</A>.*?</TD></TR>',
	    match_content => '<hr width=600 align=left>.*?(<pre>.*?</pre>).*?<hr width=600 align=left>',
    },
    mbox =>
    {
	    base => 'http://www.deja.com',
	    get_url => '/=dnc/dnquery.xp?DBS=1&VW=',
        match_message => '(\\d{2}/\\d{2}/\\d{2}).*?<A HREF=\"(http://.{3}\\.deja\\.com/=dnc/getdoc\\.xp.*?)\">(.*?)</a>.*?<B>(.*?)</B>.*?<FONT.*?>(.*?)</FONT>',
        get_next_url => '<A HREF=\"(http://.{3}\\.deja\\.com/=dnc/dnquery\\.xp\\?search=next[^>]*?&CONTEXT=[\\d\\.]+)\">Next messages',
        match_forum => 'Newsgroups: ([^\n\r]*)',
        match_thread => '',
        match_subject => 'Subject: ([^\n\r]*)',
        match_date => 'Date:\s(.*?\s.*?\s.*?)\s',
        match_author => '(?:([^\n\r]*) (\([^\n\r]*\)))|(?:([^\n\r]*) (<[^\n\r]*>))',
        match_content => '(.*)',
    },
);

# Initialize common parameters
$SUMMARY_ONLY = 0;
$interface = "new";
$type = "all";
$proxy = $ENV{'http_proxy'};
$sleep = 0;
$maxcount = 99999;
$output = "summary.html";
$status = 1;
$verbose = 1;
$fromdate = "";
$todate = "";
$debug = 0;

# Read user config file
$auth = "";
$authfile = $ENV{'HOME'} . "/.dejasearchrc";
if (-e $authfile)
{
    $mode = (stat($authfile))[2] & 0777;
    if ($mode != 0400 && $mode != 0600)
    {
        printf("Permissions for .dejasearchrc must be 0400 or 0600!\n");
        exit(-1);
    }

    open(CONFIG, "< $authfile") or die "Unable to open $authfile for reading!\n";
    $auth = <CONFIG>;
    if (defined($auth))
    {
        chomp $auth;
    }
    else
    {
        $auth = "";
    }
    close(CONFIG);

    print "Using authentication information from $authfile ...\n";
}

# Different initialization for command-line or CGI version
if ($CGI)
{
    # We got called through CGI, so we'll present the user with a query form and results window.
    $request = new CGI;                         # initiate CGI object
    $output = "-";                              # set output file to stdout
    $verbose = 0;                               # disable search messages
    $status = 0;                                # disable download messages
    $frames = 1;                                # by default we want frames
    $TITLE = "DejaSearch";                      # give the window a title...
    $interface = $request->param("INTERFACE");  # which interface do we use?
    print $request->header;                     # print the MIME stuff

    # Check if the user uses a non-frames capable browser. Turn frames off if yes.
    foreach $browser (@non_frames_browsers)
    {
        if ($ENV{'HTTP_USER_AGENT'} =~ m|$browser|i) { $frames = 0; }
    }

    if ($frames)
    {
        # Frames version
        $request_frame = "request";
        $response_frame = "response";
        $message_frame = "message";
        $path_info = $request->path_info;
        if (!$path_info)    # if no path info, create the frameset...
        {
            &print_frameset;
            exit 0;
        }
        &print_request if $path_info=~/$request_frame/;
        &print_response if $path_info=~/$response_frame/;
        &print_message if $path_info=~/$message_frame/;
    }
    else
    {
        # Frames-less version
        $request_frame="";
        $response_frame="";
        $message_frame="";
        &print_request;
        if ($request->param('FETCH'))
        {
            &print_message;
	    }
        else
        { 
            &print_response;
        }
    }

    &print_end;
}
else
{
    # no CGI, so we assume we got called by a mere human on the command line.
    ParseArguments();

    # Ctrl-C will save partial search results
    $SIG{ 'INT' } = sub
    {
        $SIG{ 'INT' } = 'IGNORE';
        OutputResults();  
        exit 0;
    };

    # Open output file first, just in case it cannot be created.
    if (! $CGI)
    {
    open (OUTPUT, ">$output") or die "Unable to write to $output:$!\n";
    }

    # Construct query string
    ConstructQueryString();

    # Prepare for search
    $count = 0;
    $spage = 1;

    # Check recent messages if necessary
    if ($type eq "all" || $type eq "recent") { DoSearch("dncurrent"); }

    # Check past messages if necessary
    if ($count < $maxcount && ($type eq "all" || $type eq "past")) { DoSearch("dnold"); }

    # Output search results!
    OutputResults();

    # Job done!
    if (! $CGI)
    {
    close(OUTPUT);
    }
    exit 0;
}

# ---------------------------------------------------------------------------- 
# Parse command line arguments. Print help screen if no arguments given.
# ---------------------------------------------------------------------------- 
sub ParseArguments
{
    # Print help if no arguments passed
    if ($#ARGV < 0)
    {
        print "DejaSearch Ver 1.8.4\n";
        print "By Chew Wei Yih Copyleft (c) 1999\n\n";
        print "Usage:\n";
        print "  dejasearch [-proxy p] [-max m] [-output o] <search keywords>\n\n";
        print "Options:\n";
        print "  -proxy    <URL>      Proxy server in http://hostname:port format\n";
        print "  -max      <num>      Maximum number of messages to retrieve\n";
        print "  -output   <filename> Output HTML file (default: summary.html)\n";
        print "  -type     <type>     Valid values are recent, old and all (default: all)\n";
        print "  -format   <type>     Dejanews search results format.\n";
        print "                       Valid values are classic, new and mbox (default: new)\n";
        print "  -fromdate <date>     Date to limit search from (eg. Apr+1+1997)\n";
        print "  -todate   <date>     Date to limit search to (eg. Apr+8+1997)\n";
        print "  -[no]status          Display download status (default: yes)\n";
        print "  -[no]verbose         Display search status (default: yes).\n";
        print "                       Note that -noverbose implies -nostatus.\n";
        print "  -sleep   <secs>      Sleep given number of secs between each retrieval.\n";
        print "                       (default: 0)\n\n";

        print "Keywords can be separated by the connectors:\n\n";
        print "  &  - AND             beans & rice\n";
        print "  |  - OR              camel | llama\n";
        print "  &! - AND NOT         clam &! chowder\n";
        print "  ^  - NEAR            lucas ^ spielberg\n\n";

        print "Keywords can be combined with:\n\n";
        print "  Quote marks:         \"the far side\"\n";
        print "  Wildcards:           psych*\n";
        print "  Parentheses:         scully & (xfiles | x-files)\n";
        print "  Braces:              {monkey monkeying}\n\n";

        print "Keywords can be preceded by the context operators:\n\n";
        print "  ~a  - Author         ~a demos\@deja.com\n";
        print "  ~s  - Subject        ~s chess\n";
        print "  ~g  - Newsgroup      ~g alt.love\n";
        print "  ~dc - Creation date  ~dc 1996/12/31\n\n";

        exit -1;
    }

    # Parse arguments
    $result = GetOptions(
        'proxy:s'    => \$proxy,
        'max:i'      => \$maxcount,
        'output:s'   => \$output,
        'type:s'     => \$type,
        'status!'    => \$status,
        'verbose!'   => \$verbose,
        'sleep:i'    => \$sleep,
        'fromdate:s' => \$fromdate,
        'todate:s'   => \$todate,
        'format:s'   => \$interface);

    if ($maxcount < 1)
    {
        print STDERR "The -max value must be greater than 0!\n\n";
        exit -1;
    }

    if ($type !~ /^(all|recent|past)$/)
    {
        print STDERR "The -type value must be recent, past or all!\n\n";
        exit -1;
    }

    if ($interface !~ /^(new|classic|mbox)$/)
    {
        print STDERR "The -interface value must be classic, new or mbox!\n\n";
        exit -1;
    }

    if ($result eq "") { exit 0; }
    $param = join(' ',@ARGV);

}

# ---------------------------------------------------------------------------- 
# Construct query string
# ---------------------------------------------------------------------------- 
sub ConstructQueryString
{
    $query = $param;
    $query =~ s|^\s+||sig;
    $query =~ s|\s+$||sig;
    $query =~ s|\s+|\+|sig;
    $query =~ s|&|\%26|sig;
    $query =~ s|!|\%21|sig;
    $query =~ s|\^|\%5E|sig;
    $query =~ s|\(|\%28|sig;
    $query =~ s|\)|\%29|sig;
    $query =~ s|~|\%7E|sig;
    $query =~ s:\Q|\E:\%7C:sig;
    $query =~ s|{|\%7B|sig;
    $query =~ s|}|\%7D|sig;
    $query =~ s|\[|\%22|sig;
    $query =~ s|\]|\%22|sig;
    $query =~ s|"|\%22|sig;
}

# ----------------------------------------------------------------------------       
# Perform search.
# ----------------------------------------------------------------------------       
sub DoSearch
{
    my $type = shift;

    $url = $deja{$interface}{'base'} . $deja{$interface}{'get_url'} . "&QRY=$query&svcclass=$type" .
           "&fromdate=" . $fromdate . "&todate=" . $todate;
    while($url ne "")
    {
        # Fetch the first page of search results
        if ($verbose && !$CGI) { print STDERR "Fetching page $spage of search results ...\n"; }
        GetURL($url) or die "Failed to retrieve $url: ", $resp_code, "\n", $resp_buf;
        ($page = $resp_buf) =~ s/[\r\n]/ /sig;
        $spage++;
    
        # Parse search results
        $url = "";

        while($page =~ m|$deja{$interface}{'match_message'}|sig)
        {
            if ($SUMMARY_ONLY)
            {
                # Since we do not download each message, we need to get
                # these values from the results page. Fortunately this is
                # no problem, since everything but the actual message content
                # is available...
                ($index[$count], $date[$count], $fetch[$count], $subject[$count], $forum[$count], $author[$count]) =
                    ($count, $1, $2, $3, $4, $5);
                $fetch[$count] =~ s|AN=(\d+)\.\d|AN=$1|sig;

		        # Rudimentary sanity check. The actual message might still be
		        # unavailable, but we won't check for that since it takes
		        # too much time.
		        if ($date[$count] eq "" || $author[$count] eq "" || $subject[$count] eq "" || $forum[$count] eq "")
		        {
		            next;
		        }
            }
            else
            {
                $fetch[$count] = $2;
                if ($debug) { print "Debug: Fetching $2\n"; }
                $fetch[$count] =~ s|AN=(\d+)\.\d|AN=$1|sig;
                $fetch[$count] .= "&fmt=text" if ($interface eq "mbox");
                $resp_buf = "";
                GetURL($fetch[$count]) or die "Failed to retrieve $fetch[$count]:", $resp_code, "\n", $resp_buf;

                ($index[$count], $rc, $thread[$count], $date[$count], $subject[$count], $forum[$count],
                    $author[$count], $author_email[$count], $content[$count]) = ($count, ProcessPage($resp_buf));

                if ($rc != 0) { next; };
                if ($verbose)
                {
                    printf STDERR "%3d. %s - %s\n", $count+1, $date[$count], $subject[$count];
                }
            }    
           
            if (++$count >= $maxcount) { last; }
            if ($sleep != 0) { sleep($sleep); }
        }
    
        # Get next summary page
        while($count < $maxcount && $page =~ m|$deja{$interface}{'get_next_url'}|sig)
        {
            $url = $1;
        }
    }
}

# ----------------------------------------------------------------------------       
# Output search results to file (might be stdout).
# ----------------------------------------------------------------------------       
sub OutputResults
{
    # If nothing found, exit
    if ($#index < 0)
    {
        if ($CGI)
        {
            print qq(<BODY BGCOLOR="#ffffff">);
            print qq(<b>Nothing found.</b>\n);
            return;
        }
        else
        {
            print "Nothing found.          \n";
            exit -1;
        }
    }


    # Output results header
    if ($CGI)
    {
        print qq(<BODY BGCOLOR="#ffffff">);
    }
    else
    {
        # Open output file in append mode
        open (OUTPUT, ">> $output") or die "Unable to write to $output:$!\n";

        # In mbox mode, print no headers
        if ($interface ne "mbox")
        {
            print OUTPUT qq(<HTML>\n<HEAD><TITLE>Dejanews Search Results</TITLE></HEAD>\n<BODY BGCOLOR="#ffffff">\n);
            print OUTPUT qq(<H1>Dejanews Search Results for:</H1></A>\n);
            print OUTPUT qq(<H3>$param</H3>\n);
        }
    }

    # Sort search results
    @sortedIndex = sort SortMessages @index;
     
    # Output hyperlinked table-of-contents
    OutputTOC() if ($interface ne "mbox");
    
    # Output messages if not in summary mode
    if (!$SUMMARY_ONLY)
    {
        foreach $i (@sortedIndex) { OutputMessage(); }
    }

    # Job done!
    if ($CGI)
    {
        print qq(</BODY>\n</HTML>\n) if ($interface ne "mbox");
    }
    else
    {
        print OUTPUT qq(</BODY>\n</HTML>\n);
    }
}

# ----------------------------------------------------------------------------
# Output TOC of search results.
# ----------------------------------------------------------------------------       
sub OutputTOC
{
    if ($CGI)
    {
        if (!$SUMMARY_ONLY)
        {
            print "<P><FONT SIZE=2>A total of " . ($#date+1) .
                " messages matching your search criteria have been found.</FONT><BR><BR>\n";
        }
        print qq(<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=1>\n);
        print qq(<TR BGCOLOR="#dddddd">\n);
        print qq(<TH><FONT SIZE=2><B>No.</B></FONT></TH>\n);
        print qq(<TH><FONT SIZE=2><B>Subject</B></FONT></TH>\n);
        print qq(<TH><FONT SIZE=2><B>Date</B></FONT></TH>\n);
        print qq(<TH><FONT SIZE=2><B>Author</B></FONT></TH>\n);
        print qq(<TH><FONT SIZE=2><B>Newsgroup</B></FONT></TH>\n);
        $count = 1;
        foreach $i (@sortedIndex)
        {
            my $ssubject = substr $subject[$i], 0, 50;
            my $sauthor = substr $author[$i], 0, 15;
            my $sforum = substr $forum[$i], 0, 30;
            print qq(<TR>\n);
            print qq(<TD><FONT SIZE=2>&nbsp;&nbsp;$count.&nbsp;&nbsp;</FONT></TD>\n);
            if ($SUMMARY_ONLY)
            {
                print qq(<TD><FONT SIZE=2>&nbsp;&nbsp;<A HREF=\"$message_frame?FETCH=$fetch[$i]&INTERFACE=$interface\" TARGET=\"$message_frame\">$ssubject</A>&nbsp;&nbsp;</FONT></TD>\n);
            }
            else
            {
                print qq(<TD><FONT SIZE=2><A NAME=\"TOC$i\">&nbsp;&nbsp;<A HREF=\"#M$i\">$ssubject</A>&nbsp;&nbsp;</FONT></TD>\n);
            }
            print qq(<TD><FONT SIZE=2 COLOR="#0000ff">&nbsp;&nbsp;$date[$i]&nbsp;&nbsp;</FONT></TD>\n);
            print qq(<TD><FONT SIZE=2 COLOR="#0000ff">&nbsp;&nbsp;$sauthor&nbsp;&nbsp;</FONT></TD>\n);
            print qq(<TD><FONT SIZE=2 COLOR="#0000ff">&nbsp;&nbsp;<B>$sforum</B>&nbsp;&nbsp;</FONT></TD>\n);
            print qq(</TR>\n);
            $count++;
        }
        print qq(</TABLE>\n);
        print qq(<BR>\n);
        if (!$SUMMARY_ONLY) { print qq(<HR>\n); }
    }
    else
    {
        if (!$SUMMARY_ONLY)
        {
            print OUTPUT "<P><FONT SIZE=2>A total of " . ($#date+1) .
                " messages matching your search criteria have been found.</FONT><BR><BR>\n";
        }
        print OUTPUT qq(<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=1>\n);
        print OUTPUT qq(<TR BGCOLOR="#dddddd">\n);
        print OUTPUT qq(<TH><FONT SIZE=2><B>No.</B></FONT></TH>\n);
        print OUTPUT qq(<TH><FONT SIZE=2><B>Subject</B></FONT></TH>\n);
        print OUTPUT qq(<TH><FONT SIZE=2><B>Date</B></FONT></TH>\n);
        print OUTPUT qq(<TH><FONT SIZE=2><B>Author</B></FONT></TH>\n);
        print OUTPUT qq(<TH><FONT SIZE=2><B>Newsgroup</B></FONT></TH>\n);
        $count = 1;
        foreach $i (@sortedIndex)
        {
            my $ssubject = substr $subject[$i], 0, 50;
            my $sauthor = substr $author[$i], 0, 15;
            my $sforum = substr $forum[$i], 0, 30;
            print OUTPUT qq(<TR>\n);
            print OUTPUT qq(<TD><FONT SIZE=2>&nbsp;&nbsp;$count.&nbsp;&nbsp;</FONT></TD>\n);
            if ($SUMMARY_ONLY)
            {
                print OUTPUT qq(<TD><FONT SIZE=2>&nbsp;&nbsp;<A HREF=\"$message_frame?FETCH=$fetch[$i]&INTERFACE=$interface\" TARGET=\"$message_frame\">$ssubject</A>&nbsp;&nbsp;</FONT></TD>\n);
            }
            else
            {
                print OUTPUT qq(<TD><FONT SIZE=2><A NAME=\"TOC$i\">&nbsp;&nbsp;<A HREF=\"#M$i\">$ssubject</A>&nbsp;&nbsp;</FONT></TD>\n);
            }
            print OUTPUT qq(<TD><FONT SIZE=2 COLOR="#0000ff">&nbsp;&nbsp;$date[$i]&nbsp;&nbsp;</FONT></TD>\n);
            print OUTPUT qq(<TD><FONT SIZE=2 COLOR="#0000ff">&nbsp;&nbsp;$sauthor&nbsp;&nbsp;</FONT></TD>\n);
            print OUTPUT qq(<TD><FONT SIZE=2 COLOR="#0000ff">&nbsp;&nbsp;<B>$sforum</B>&nbsp;&nbsp;</FONT></TD>\n);
            print OUTPUT qq(</TR>\n);
            $count++;
        }
        print OUTPUT qq(</TABLE>\n);
        print OUTPUT qq(<BR>\n);
        if (!$SUMMARY_ONLY) { print OUTPUT qq(<HR>\n); }
    }
}

# ----------------------------------------------------------------------------
# Output messages retrieved during search.
# ----------------------------------------------------------------------------       
sub OutputMessage
{
    if ($CGI)
    {
        print qq(<HTML>\n<HEAD><TITLE>Dejanews: $subject[$i] - $date[$i] - $author[$i] - $forum[$i]</TITLE></HEAD>\n<BODY BGCOLOR=#ffffff>\n);
        print qq(<PRE>\n);
        print qq(<A NAME=\"M$i\">);
        print qq(<FONT COLOR="#0000ff"><B>Date:      $date[$i]</B></FONT></A>\n);
        print qq(<FONT COLOR="#0000ff"><B>Subject:   $subject[$i]</B></FONT>\n);
        print qq(<FONT COLOR="#0000ff"><B>Newsgroup: $forum[$i]</B></FONT>\n);
        print qq(<FONT COLOR="#0000ff"><B>Author:    $author[$i]&nbsp;&lt;<A HREF="mailto:$author_email[$i]">$author_email[$i]</A>&gt;</B></FONT>\n);
        if ( !$SUMMARY_ONLY)
        {
            print qq(<FONT SIZE=-1><A HREF="#TOC$i">Top</A>&nbsp;);
        }
        print qq(<A HREF="$thread[$i]" TARGET="_blank">Thread</A>&nbsp;);
        print qq(<A HREF="$fetch[$i]" TARGET="_blank">Message</A></FONT>\n\n);
        print qq(</PRE>\n);
        print $content[$i];
        if (!$SUMMARY_ONLY) { print qq(<HR>\n); }
    }
    else
    {
        open (OUTPUT, ">> $output") or die "Unable to write to $output:$!\n";

        if ($interface eq "mbox")
        {  
            my $datestr = "Tue Sep 1 00:00:00 1992";

            if ($content[$i] =~ /NNTP-Posting-Date:.*?([0-9]+ \w+ [0-9]+ [0-9]+:[0-9]+:[0-9]+ \w+)/)
            {
                my $realtime = $1;
                $date[$i] =~ s/[0-9]+ \w+ [0-9]+ [0-9]+:[0-9]+:[0-9]+ \w+/$realtime/;

                # also, fix the real date
                $content[$i] =~ s/Date: [0-9]+ \w+ [0-9]+ [0-9]+:[0-9]+:[0-9]+ \w+/Date: $realtime/;
            }

            if ($date[$i] =~ /([0-9]+) (\w+) ([0-9]+) ([0-9]+):([0-9]+):([0-9]+)/)
            {
                my %mons = ('Jan' => 0, 'Feb' => 1, 'Mar' => 2, 'Apr' => 3,
                            'May' => 4, 'Jun' => 5, 'Jul' => 6, 'Aug' => 7,
                            'Sep' => 8, 'Oct' => 9, 'Nov' => 10, 'Dec' => 11 );
 
                $datestr = strftime("%a %b %d %T %Y", $6, $5, $4, $1, $mons{$2}, $3 - 1900);
            }

            $content[$i] =~ /([^<\s]*@[^>\s]*)/;
            my $em = defined($1) ? $1 : "nobody\@nowhere.com";
            print OUTPUT "From $em  $datestr\n";
            print OUTPUT $content[$i], "\n\n";
        }
        else
        {
            print OUTPUT qq(<PRE>\n);
            print OUTPUT qq(<A NAME=\"M$i\">);
            print OUTPUT qq(<FONT COLOR="#0000ff"><B>Date:      $date[$i]</B></FONT></A>\n);
            print OUTPUT qq(<FONT COLOR="#0000ff"><B>Subject:   $subject[$i]</B></FONT>\n);
            print OUTPUT qq(<FONT COLOR="#0000ff"><B>Newsgroup: $forum[$i]</B></FONT>\n);
            print OUTPUT qq(<FONT COLOR="#0000ff"><B>Author:    $author[$i]&nbsp;&lt;<A HREF="mailto:$author_email[$i]">$author_email[$i]</A>&gt;</B></FONT>\n);
            print OUTPUT qq(<FONT SIZE=-1><A HREF="#TOC$i">Top</A>&nbsp;);
            print OUTPUT qq(<A HREF="$thread[$i]" TARGET="_blank">Thread</A>&nbsp;);
            print OUTPUT qq(<A HREF="$fetch[$i]" TARGET="_blank">Message</A></FONT>\n\n);
            print OUTPUT qq(</PRE>\n);
            print OUTPUT $content[$i];
            if (!$SUMMARY_ONLY) { print OUTPUT qq(<HR>\n); }
        }
    }
}

# ----------------------------------------------------------------------------
# Download given URL using global agent
# Params: URL
# Returns: 1 if URL downloaded, 0 if unable to download URL
# ----------------------------------------------------------------------------       
sub GetURL
{
    my $url = shift;
    ( $resp_code, $resp_buf ) = getPage( parseURL( $url) );
    $resp_code=~m|^2| ? 1:0
}

# ----------------------------------------------------------------------------       
# Process news page for date, subject, newsgroup and content info.
# Params: page data
# Returns: status (-1 = duplicate), date, subject, newsgroup, content
# ----------------------------------------------------------------------------       
sub ProcessPage
{
    my ($page, $thread,   $subject,  $date,     $forum,    $author,   $author_email, $content) =
       (shift, "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown",     "");

    # We need to extract the content before removing the cr-lf sequences, since
    # the 'classic' interface uses preformatted content. Removing the crlf's
    # from the content section woul result in one long line of content instead
    # of the actual thing.
    #
    # Content detection
    if ($page =~ m|$deja{$interface}{'match_content'}|si)
    {
        $content = $1;
    }

    # Remove CR/LF from page
    if ($interface ne "mbox") { $page =~ s/[\r\n]/ /sig; }

    # Newsgroup detection
    if ($page =~ m|$deja{$interface}{'match_forum'}|si)
    {
        ($forum = $1) =~ s|,.*?$||sig;
    }

    # Thread detection
    if ($page =~ m|$deja{$interface}{'match_thread'}|si)
    {
        $thread = $1;
    }

    # Subject detection
    if ($page =~ m|$deja{$interface}{'match_subject'}|si)
    {
        $subject = $1;
    }

    # Date detection
    if ($page =~ m|$deja{$interface}{'match_date'}|si)
    {
        $date = $1;
    }

    # Author detection
    if ($page =~ m|$deja{$interface}{'match_author'}|si)
    {
        $author = $1;
        $author_email = $2;
    }

    # Check if message is from a newsgroup in the reject list
    foreach $i (@rejectList)
    {
        if ($forum =~ m|$i|i)
        {
            return (-1, $thread, $date, $subject, $forum, $author, $author_email, $content);
        }
    }

    # Check if subject is unknown
    if ($subject eq "Unknown" || $date eq "Unknown" || $forum eq "Unknown" || $content eq "")
    {
        if ($debug && $subject eq "Unknown") { print "Debug: Subject unknown!\n"; }
        if ($debug && $date eq "Unknown")    { print "Debug: Date unknown!\n";    }
        if ($debug && $forum eq "Unknown")   { print "Debug: Forum unknown!\n";   }
        if ($debug && $content eq "")        { print "Debug: Content unknown!\n"; }
        return (-1, $thread, $date, $subject, $forum, $author, $author_email, $content);
    }

    # Check if content is duplicate
    foreach $i (@index)
    {
        if ($content eq $content[$i])
        {
            print STDERR "Duplicate message discarded.\n";
            return (-1, $thread, $date, $subject, $forum, $author, $author_email, $content);
        }
    }
    
    return (0, $thread, $date, $subject, $forum, $author, $author_email, $content);
}

# ----------------------------------------------------------------------------       
# Sort messages according to newsgroup, subject and date (reverse) order.
# Params: none
# Returns: -1, 0 or 1 according to order
# ----------------------------------------------------------------------------       
sub SortMessages
{
    # First sort criteria is newsgroup
    if ($forum[$a] ne $forum[$b]) { return ($forum[$a] cmp $forum[$b]); }

    # Second sort criteria is subject (w/o "Re:")
    (my $s1 = $subject[$a]) =~ s|Re:\s*||;
    (my $s2 = $subject[$b]) =~ s|Re:\s*||;
    if ($s1 ne $s2) { return ($s1 cmp $s2); }
    
    # Third sort criteria is date
    if ($date[$a] ne $date[$b]) { return ($date[$b] cmp $date[$a]); }

    # If all criteria match, return 0
    return 0;
}

# ----------------------------------------------------------------------------       
# Contributions by Steffen Ullrich <coyote.frank@gmx.net>
# ----------------------------------------------------------------------------       
sub makeURL
{
    my ($P,$host,$port,$base ) = @_;
    if ( $P ) {
        if ($P=~m|^([a-zA-Z]+)://|) {
            my @urlp = parseURL($P);
            # return undef unless ($urlp[0] eq $host and $urlp[1] eq $port);
            $urlp[2] = "/" unless ($urlp[2]);
            $urlp[2] =~s|/+|/|g;
            return sprintf "http://%s:%d%s",@urlp;
        }
        return undef if ($P=~m|^mailto:|i);
        if ( $P!~m|^/|) {
            $base=~s|/[^/]*$||;
            $P=$base."/".$P;
            $P=~s|/+|/|g;
        }
    } else { $P = $base }
    sprintf "http://%s:%d%s",$host,$port,$P;
}

# ----------------------------------------------------------------------------
sub parseURL
{
    local $_ = shift;
    my ($host,$port,$page) = m|http://([^:/]+)(?::(\d+))?(/[^#]*)?|i;
    return () unless ($host);
    $port=80 unless ($port);
    $page="/" unless ($page);
    $host = lc($host);
    $page =~s|/+|/|g;
    ($host,$port,$page);
}

# ----------------------------------------------------------------------------
sub getPage
{  
    my ($host,$port,$page,$isProxy) = @_;
    if ($proxy and !$isProxy)
    {
        my @p = parseURL( $proxy );
        return getPage( $p[0],$p[1], makeURL( '',$host,$port,$page), 1 );
    }

    my $iaddr = inet_aton($host) || die "$host: $!";
    my $paddr = sockaddr_in($port,$iaddr);
    socket(SOCKET, PF_INET, SOCK_STREAM, getprotobyname('tcp')) or die $!;
    connect(SOCKET, $paddr) || die $!;
    my $s = select(SOCKET); $|=1; select($s);

    if ($isProxy && $auth ne "")
    {
        require MIME::Base64;
        print SOCKET "GET $page HTTP/1.0\r\n";
        $authStr = MIME::Base64::encode_base64($auth);
        print SOCKET "Proxy-Authorization: Basic $authStr\r\n\r\n";
    }
    else
    {
        print SOCKET "GET $page HTTP/1.0\r\n\r\n";
    }

    my ($ctyp, $code, $location) = ("", "", "");
    while (defined ($_=<SOCKET>))
    {
        last if /^\s*$/;
        $ctyp = lc($1), next if /^Content-type:\s*([^\s]+)/i;
        $location = $1, next if /^Location:\s*([^\s]+)/i;
        $code = $1, next if m|^HTTP/\d\.\d\s+(\d\d\d)|;
    }

    if ($code =~ m|^3| and $location)
    {
        $location = $deja{$interface}{'base'} . $location if (!($location =~ m|^http://|));
        $location = $1 . $3 . $2 . $4 if ($location =~ m|^(.*?)(\[S[^/]+/)(=dnc/)(.*?)$|);
        return getPage( parseURL( makeURL( $location,$host,$port,$page )));
    }
        
    my ($len,$buf,$bytes) = (0, "", 0,); $| = 1;

    while(1)
    {
        eval
        {
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm 10; # 10s timeout
            $bytes = read SOCKET, $buf, 1024, $len;
            alarm 0;
        };
        if ($@)
        {
            die unless $@ eq "alarm\n"; # propagate unexpected errors
            printf STDERR "Read timeout... Retrying...\n";
        }

        last if ($bytes == 0);
        $len+=$bytes;
        if ($status && $verbose) { printf STDERR "[%7d bytes]\r", $len; }
    }

    $ctyp =~ s|(.*);(.*)|$1|;
    ($code, $buf, $ctyp, $host, $port, $page);
}

# ----------------------------------------------------------------------------       
# Contributions by Frank de Lange <frank@unternet.org>
# ----------------------------------------------------------------------------       

# ---------------------------------------------------------------------------- 
# Construct the frame set
# ---------------------------------------------------------------------------- 
sub print_frameset
{
    $script_name = $request->script_name;
    print <<EOF;
    <html><head><title>$TITLE</title></head>
    <frameset framespacing=0 frameborder=0 rows="55,*">
    <frame src="$script_name/$request_frame" name="$request_frame" SCROLLING="No" NORESIZE>
    <frame src="$script_name/$response_frame" name="$response_frame">
    </frameset>
EOF
    ;
    exit 0;
}

sub print_message_frameset
{
    $script_name = $request->script_name;
    $myself = $request->self_url;
    print <<EOF;
    <frameset rows="30%,*">
    <frame src="$myself&FRAMED=1" name="$response_frame">
    <frame src="$script_name/$message_frame" name="$message_frame">
    </frameset>
EOF
    ;
    exit 0;
}

# ---------------------------------------------------------------------------- 
# Print HTML header
# ---------------------------------------------------------------------------- 
sub print_html_header
{
    print $request->start_html($TITLE);
}

# ---------------------------------------------------------------------------- 
# Print HTML trailer
# ---------------------------------------------------------------------------- 
sub print_end
{
    print $request->end_html;
}

# ---------------------------------------------------------------------------- 
# Print the request line
# ---------------------------------------------------------------------------- 
sub print_request
{
    $script_name = $request->script_name;

    print $request->startform(
        -action=>"$script_name/$response_frame",
        -TARGET=>"$response_frame");
    print "<p><a href='http://homemade.hypermart.net/dejasearch/'
           target='_blank'>DejaSearch</a>: ",
          $request->textfield(-name=>'QUERY', -size=>30);

    print "&nbsp;";

    print $request->submit('submit','Submit');

    print " Max ",$request->popup_menu(
        -name=>'MAXCOUNT',
        -values=>[5, 10, 15, 20, 25, 35, 50, 75, 100, 150, 200, 300, 400, 500],
        -default=>25);

    print $request->popup_menu(
        -name=>'SUMMARY_ONLY',
        -values=>['headers only','show messages'],
        -default=>'headers only');

    print $request->popup_menu(
        -name=>'TYPE',
        -values=>['all','recent', 'past'],
        -default=>'all');

    print $request->popup_menu(
        -name=>'INTERFACE',
        -values=>['new','classic'],
        -default=>'new');

    print $request->endform;
}

# ---------------------------------------------------------------------------- 
# Print the query results into the results frame
# ---------------------------------------------------------------------------- 
sub print_response
{
    unless ($request->param) {
        print "<BODY BGCOLOR=#ffffff>\n";
        print '<P><H3>Deja.com Search Language Quick Reference</H3>',
              '<TABLE WIDTH=600 BORDER=0 CELLPADDING=0, CELLSPACING=1>',
              '<TR><TD COLSPAN=4><B>Keywords can be separated by the following connectors:</B><BR><BR></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>&</TD><TD><FONT SIZE=-1 COLOR="blue">- AND</FONT></TD><TD><TT>eg. beans & rice</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>|</TD><TD><FONT SIZE=-1 COLOR="blue">- OR</FONT></TD><TD><TT>eg. camel | llama</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>&!</TD><TD><FONT SIZE=-1 COLOR="blue">- AND NOT</FONT></TD><TD><TT>eg. clam &! chowder</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>^</TD><TD><FONT SIZE=-1 COLOR="blue">- NEAR</FONT></TD><TD><TT>eg. lucas ^spielberg</TT></TD></TR>',
              '<TR><TD COLSPAN=4><BR><B>Keywords can be combined with the following symbols:</B><BR><BR></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>&quot;...&quot;</TD><TD><FONT SIZE=-1 COLOR="blue">- Quote Marks</FONT></TD><TD><TT>eg. "the far side"</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>*</TD><TD><FONT SIZE=-1 COLOR="blue">- Wildcard</FONT></TD><TD><TT>eg. psych*</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>(...)</TD><TD><FONT SIZE=-1 COLOR="blue">- Parentheses</FONT></TD><TD><TT>eg. scully & (xfiles | x-files)</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>{...}</TD><TD><FONT SIZE=-1 COLOR="blue">- Braces</FONT></TD><TD><TT>eg. {monkey monkeying}</TT></TD></TR>',
              '<TR><TD COLSPAN=4><BR><B>Keywords can be preceded by the following context operators:</B><BR><BR></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>~a</TD><TD><FONT SIZE=-1 COLOR="blue">- Author</FONT></TD><TD><TT>eg. ~a demos@deja.com</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>~s</TD><TD><FONT SIZE=-1 COLOR="blue">- Subject</FONT></TD><TD><TT>eg. ~s chess</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>~g</TD><TD><FONT SIZE=-1 COLOR="blue">- Newsgroup</FONT></TD><TD><TT>eg. ~g alt.love</TT></TD></TR>',
              '<TR><TD WIDTH=50>&nbsp;</TD><TD>~dc</TD><TD><FONT SIZE=-1 COLOR="blue">- Creation date</FONT></TD><TD><TT>eg. ~dc 1996/12/31</TT></TD></TR>',
              '</TABLE>',
              '</BLOCKQUOTE>';
        return;
    }
    if ($request->param('SUMMARY_ONLY') eq 'headers only') {
        $SUMMARY_ONLY = 1;
	if (! $request->param('FRAMED') and ($frames)) {
            print_message_frameset();
            exit 0;
	  }
    }
    $type = $request->param('TYPE');
    $interface = $request->param('INTERFACE');
    $maxcount = $request->param('MAXCOUNT');
    $param = $request->param('QUERY');

    # Construct query string
    ConstructQueryString();

    # Prepare for search
    $count = 0;
    $spage = 1;

    # Check recent messages if necessary
    if ($type eq "all" || $type eq "recent") { DoSearch("dncurrent"); }

    # Check past messages if necessary
    if ($count < $maxcount && ($type eq "all" || $type eq "past")) { DoSearch("dnold"); }

    # Output search results!
    OutputResults();
}

# ---------------------------------------------------------------------------- 
# Print a single message
# ---------------------------------------------------------------------------- 
sub print_message
{
    unless ($request->param) {
        print "<BODY BGCOLOR=#ffffff>\n";
        return;
    }
    if ($request->param('FETCH')) {
        $index = 1;
        $fetch = $request->param('FETCH');
        $interface = $request->param('INTERFACE');
        GetURL($fetch) or die "Failed to retrieve $fetch:", $resp_code, "\n", $resp_buf;
        ($rc, $thread[0], $date[0], $subject[0], $forum[0],
            $author[0], $author_email[0], $content[0], $fetch[0]) = (ProcessPage($resp_buf), $fetch);
        $i = 0; $CGI = $SUMMARY_ONLY = 1; OutputMessage();
    }
}
