Robots: A Tutorial

CharlieSpider 2

[ Comments ] [ Copyright ] [ Main contents ]

Name

CharlieSpider/0.2

Usage

CharlieSpider http://host.domain.com/some/valid/url

Requirements

Perl 5.002
Net-FTP-1.18
libwww-perl-5.00

Description

CharlieSpider retrieves the specified file, parses it as HTML, expands any URLs present in the file, and issues HEAD requests via HTTP in order to test whether the links are valid.

For each valid link which targets a document on the current server, it then repeats the process, until no more HTML documents are found in the local web.

It then prints a brief report.

(Note that this is a recursive-descendent variation on CharlieSpider/0.1.)

Limitations

CharlieSpider/0.2 does checks the content-type of a retrieved document and rejects non-HTML files. It does not know how to handle the SMTP protocol (unless you've installed the SMTP module for perl 5). And it is possible that it will give an error retrieving a file due to problems with network connectivity rather than document link infrastructure.

If documents contain URLs pointing to internal sections, it will treat each of them as a separate document to link to.


#!/usr/bin/perl -w

use Config;
use URI::URL;
use HTML::Parse;
use HTTP::Response;
use HTTP::Status;
use LWP::Debug;
use LWP::UserAgent;
use LWP::Simple;


print lt;lt; "%%";
CharlieSpider 0.2
Recursive-descent link checker

%%
my (%visited);
my (%broken);
my ($target) = shift || die &usage;
$| = 1;
check_page($target, \%visited);
exit 0;

sub check_page {
   my ($target) = shift @_;
   my ($visited) = shift @_;
   my ($broken) = shift @_;
   my ($BASE) = $target;
   my ($total_links)  = 0;
   my ($error_links) = 0;
   my (%errors);
   my (@doc_links, @exp_links);
   my ($ua) = new LWP::UserAgent ;
   $ua->agent('CharlieSpider/0.1');
   my ($req) = new HTTP::Request GET => $target;
   my ($res) = $ua->request($req);
    
   # Check the outcome of the response
   if ($res->is_success) {
      $visited->{$target}++;
      print "$target\n";
      if ($res->content_type !~ /html/i) {
          print "\t-- not an HTML document\n\n";
          return;
      }
      my($p) = parse_html($res->content);
      for (@{ $p->extract_links(qw(a)) }) {
          my ($link) = @$_[$[];
          push(@doc_links, $link);
          $total_links++;
          undef $link;
      }
      print "\tIssuing HEAD requests ";
      for (@{ $p->extract_links(qw(a)) }) {
          my ($link) = url(@$_[$[], $BASE)->abs->as_string ;
          push(@exp_links, $link);
          my ($head) = new LWP::UserAgent;
          $head->agent('CharlieSpider/0.1');
          my ($head_req) = new HTTP::Request HEAD => $link;
          my ($head_res) = $head->request($head_req);
          if ($head_res->is_success) {
              print "+"; 
              # $visited->{$link}++;
          } else {
              print  "!";  
              push(@{$errors{$head_res->code}}, $link);
              $broken->{$link}++;
              $error_links++;
          }
      }
      print "\n";
      my ($cnt) = 0;
      foreach (@exp_links) {
             $expansion{$_} = $doc_links[$cnt++];
      }
    
      print "\tFound links to $total_links pages.\n";
      if ($error_links > 0) {
         print "\tOf these, $error_links links are broken.\n";
         print "\tBreakdown by error code:\n";
         while (($key, $value) = each (%errors)) {
            print "\n\t$key: ", status_message($key) ,"\n"; 
            foreach (@{$value}) {
                  print "\n\t\t$expansion{$_}\n\t\t[ GET $_]";
            }
            print "\n";
         }
      } else {
         print "\tNo links are broken.\n";
      }
      print "\n\n";
      foreach (@exp_links) {
          my ($next) =  url($_);
          next if (defined $next->frag);
          next if (defined $next->query);
          if (( $next->host eq url($BASE)->host) &&
              ( ! defined ($broken->{$next->as_string}) ) &&
              ( ! defined ($visited->{$next->as_string}) )) {
                  check_page($_, $visited, $broken);
          }
      }
   } else {
      print "\n$target failed (", status_message($res->code),")\n";
   }
} # check_page()
exit 0;


sub usage {
    print lt;lt;"%%"
    Usage: linkcheck.pl http://host.somedomain.com/file/pathname
%%
}