#!/usr/bin/perl 

use Config;
use URI::URL;
use HTML::Parse;
use HTTP::Response;
use HTTP::Status;
use LWP::Debug;
use LWP::UserAgent;
use LWP::Simple;
use CGI;

my($q) = new CGI;

$| = 1; # flush headers now
# print $q->header('HTTP/1.0 200 OK');
print $q->header( -type => 'text/html',
                  -status => '200 OK',
                  );
print "\r\n";

print $q->start_html( -title=>'CharlieSpider/0.2',
                      -author=>'charlie@antipope.org',
                      -BGCOLOR=>'#F0F0F0' );

print $q->h1('CharlieSpider/0.2');
print $q->h2('Recursive-descent link checker');
print $q->p('&copy; Charlie Stross, 1996');
print $q->hr;

my (%visited);
my (%broken);
$| = 1;

print "<PRE>\n";

$q->param('target') ? check_page($q->param('target'), \%visited) : usage();

print "</PRE>\n";
print $q->end_html;

exit 0;

sub check_page {
   my ($target) = shift @_;
   my ($visited) = shift @_;
   my ($broken) = shift @_;
   my ($BASE) = $target;
   my ($total_links)  = 0;
   my ($error_links) = 0;
   my (%errors);
   my (@doc_links, @exp_links);
   my ($ua) = new LWP::UserAgent ;
   $ua->agent('CharlieSpider/0.1');
   my ($req) = new HTTP::Request GET => $target;
   my ($res) = $ua->request($req);
    
   # Check the outcome of the response
   if ($res->is_success) {
      $visited->{$target}++;
      print "$target\n";
      if ($res->content_type !~ /html/i) {
          print "\t-- not an HTML document\n\n";
          return;
      }
      my($p) = parse_html($res->content);
      for (@{ $p->extract_links(qw(a)) }) {
          my ($link) = @$_[$[];
          push(@doc_links, $link);
          $total_links++;
          undef $link;
      }
      print "\tIssuing HEAD requests ";
      for (@{ $p->extract_links(qw(a)) }) {
          my ($link) = url(@$_[$[], $BASE)->abs->as_string ;
          push(@exp_links, $link);
          my ($head) = new LWP::UserAgent;
          $head->agent('CharlieSpider/0.1');
          my ($head_req) = new HTTP::Request HEAD => $link;
          my ($head_res) = $head->request($head_req);
          if ($head_res->is_success) {
              print "+"; 
              # $visited->{$link}++;
          } else {
              print  "!";  
              push(@{$errors{$head_res->code}}, $link);
              $broken->{$link}++;
              $error_links++;
          }
      }
      print "\n";
      my ($cnt) = 0;
      foreach (@exp_links) {
             $expansion{$_} = $doc_links[$cnt++];
      }
    
      print "\tFound links to $total_links pages.\n";
      if ($error_links > 0) {
         print "\tOf these, $error_links links are broken.\n";
         print "\tBreakdown by error code:\n";
         while (($key, $value) = each (%errors)) {
            print "\n\t$key: ", status_message($key) ,"\n"; 
            foreach (@{$value}) {
                  print "\n\t\t$expansion{$_}\n\t\t[ GET $_]";
            }
            print "\n";
         }
      } else {
         print "\tNo links are broken.\n";
      }
      print "\n\n";
      foreach (@exp_links) {
          my ($next) =  url($_);
          next if (defined $next->frag);
          next if (defined $next->query);
          if (( $next->host eq url($BASE)->host) &&
              ( ! defined ($broken->{$next->as_string}) ) &&
              ( ! defined ($visited->{$next->as_string}) )) {
                  check_page($_, $visited, $broken);
          }
      }
   } else {
      print "\n$target failed (", status_message($res->code),")\n";
   }
} # check_page()
exit 0;


sub usage {
    print $q->startform(-method => 'get',
                        -action => $q->url);

    print $q->p("Enter the root URL for the web you want to check:");
    print $q->textfield(-name => 'target',
                        -size => '50',
                        -maxlength => '80');
    print $q->submit(-name => 'Check weblet');
    print $q->reset(-name => 'Reset');

    print $q->endform;

}

