Robots: A Tutorial

CharlieSpider 1




Name
CharlieSpider
Usage
CharlieSpider http://host.domain.com/some/valid/url
Requirements
  • Perl 5.002
  • Net-FTP-1.18
  • libwww-perl-5.00
Description
CharlieSpider retrieves the specified file, parses it as HTML, expands any URLs it finds into absolute form, and issues an HTTP HEAD request for each one to test whether the link is valid. It then prints a brief report.
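The heart of the check is the HEAD request itself: if the server answers with a success status, the link is considered good. The fragment below is a minimal sketch of that single step, using the same LWP modules as the full program; the script name head-check.pl is just for illustration, and the full listing further down adds HTML parsing, URL expansion, and reporting on top of it.

# Minimal sketch: test one URL with an HTTP HEAD request.

use HTTP::Request;
use HTTP::Status;
use LWP::UserAgent;

my ($link) = shift || die "Usage: head-check.pl http://host.domain.com/some/valid/url\n";

my ($ua) = new LWP::UserAgent;
$ua->agent('CharlieSpider/0.1');

my ($req) = new HTTP::Request HEAD => $link;
my ($res) = $ua->request($req);

if ($res->is_success) {
    print "$link looks valid\n";
} else {
    print "$link failed: ", $res->code, " ", status_message($res->code), "\n";
}
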
Limitations
CharlieSpider does not check the Content-Type of a retrieved document, so it does not know how to handle non-HTML files. It does not know how to handle the SMTP protocol (unless you've installed the SMTP module for Perl 5). Finally, it may report a link as broken because of a transient network problem rather than a genuine error in the document's links.
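
The first of these limitations is easy to work around: an HTTP::Response object exposes the Content-Type header through its content_type method, so a later version could refuse to parse anything that is not text/html. The fragment below is a sketch of such a guard, not part of CharlieSpider 0.1; the script name type-check.pl is just for illustration.

# Sketch: only hand a document to parse_html() if it claims to be HTML.

use HTML::Parse;
use HTTP::Request;
use LWP::UserAgent;

my ($target) = shift || die "Usage: type-check.pl http://host.domain.com/some/valid/url\n";

my ($ua)  = new LWP::UserAgent;
my ($req) = new HTTP::Request GET => $target;
my ($res) = $ua->request($req);

if ($res->is_success && $res->content_type eq 'text/html') {
    my ($p) = parse_html($res->content);   # safe to extract links from $p
    print "Parsed $target as HTML\n";
} else {
    print "Not parsing $target (status ", $res->code,
          ", content type ", $res->content_type || "unknown", ")\n";
}

The complete CharlieSpider 0.1 listing follows.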


#!/usr/bin/perl -w

#
# Here are all the modules we need to source in and use!
#

use Config;
use URI::URL;
use HTML::Parse;
use HTTP::Response;
use HTTP::Status;
use LWP::Debug;
use LWP::UserAgent;
use LWP::Simple;

# Now we get our starting URL from the command line -- outside any sub,
# shift works implicitly on @ARGV

my ($target) = shift || die &usage;

print << "%%";
CharlieSpider 0.1
Link integrity checker

%%

# declare program-specific variables

my ($BASE) = $target;              # base url for generating absolute web paths
my ($total_links)  = 0;            # total links found
my ($error_links) = 0;             # total links returning an HTTP error
my (%errors);                      # error tracking table
my (%expansion);                   # maps absolute URLs back to the links found in the document
my (@doc_links, @exp_links);       # arrays to store found links

my ($ua) = new LWP::UserAgent;     # create a new UserAgent object
$ua->agent('CharlieSpider/0.1');    # and tell it who it is
                                   # now create a new HTTP request
my ($req) = new HTTP::Request GET => $target;
my ($res) = $ua->request($req); # associate the request with the user agent 
 
# Check the outcome of the response
if ($res->is_success) {
   print "Retrieved $target\nParsing HTML";
   my($p) = parse_html($res->content);
   print "\nExtracting links\n";   # now loop over each link returned by
                                   # the extract_links method running on
                                   # the parsed html document, where the
                                   # link type is <A>
   for (@{ $p->extract_links(qw(a)) }) {
       my ($link) = @$_[$[];       # the link target is the first element of
                                   # the anonymous array pointed to by $_. Note
                                   # that these may be _relative_ URLs.
       push(@doc_links, $link);    # push it on our stack of links extracted
   }                               # from the document
   print "Issuing HEAD requests ";
   $| = 1;
                                   # we now want to issue a HEAD request for
                                   # each link, first expanding it into an
                                   # absolute URL -- so we traverse the parsed
                                   # HTML document tree, running the expand_urls
                                   # method on each element
                                   # (This is inefficient and is omitted in
                                   #  CharlieSpider/0.2)
   $p->traverse(\&expand_urls, 1); 
   for (@{ $p->extract_links(qw(a)) }) {
       my ($link) = @$_[$[];
       push(@exp_links, $link);    # @exp_links contains absolute URLs
                                   # now create a new UserAgent and issue
                                   # a HEAD request for the URL we just found
       my ($head) = new LWP::UserAgent;
       $head->agent('CharlieSpider/0.1');
       my ($head_req) = new HTTP::Request HEAD => $link;
       my ($head_res) = $head->request($head_req);
        if ($head_res->is_success) {
            print "+";              # The link is valid
        } else {
            print "!";              # Something caused the HTTP request to fail
                                    # so push the link, keyed off its error,
                                    # onto the %errors hash of lists
            push(@{$errors{$head_res->code}}, $link);
            $error_links++;         # and increment the count of invalid links
        }
        $total_links++;             # count every link visited, valid or broken
   }
   print "\n";
   my ($cnt) = 0;
   foreach (@exp_links) {          # we need to create an ordered table of
                                   # relative urls corresponding to each
                                   # absolute url we polled, in order of
                                   # traversal
       $expansion{$_} = $doc_links[$cnt++];
   }

   # now we can print a report!

   print "\n$target contains links to $total_links pages.\n\n";
   if ($error_links > 0) {
      print "Of these, $error_links links are broken.\n";
      print "Breakdown by error code:\n";
      while (my ($key, $value) = each (%errors)) {
         print "\n$key: ", status_message($key) ,"\n"; 
         foreach (@{$value}) {
            print "\n\t$_ " , " " x (40 - length($_)), "[", 
                  $expansion{$_} , "]";
         }
         print "\n";
      }
   } else {
      print "No links are broken.\n";
   }
   print "\n\n";

} else {
   print "Failed to retrieve $target: ", $res->status_line, "\n";
}
              
exit 0;

# end of main program

sub usage {
    print <<"%%";
Usage: CharlieSpider http://host.domain.com/some/valid/url
%%
    exit 1;
}

sub expand_urls {

    # for a given HTML element, if it is a URL, expand it to an absolute URL
    # -- see the lwpcook.3 manpage (with the LWP distribution) for details

    my %link_elements = (
          'a'    => 'href',
          'img'  => 'src',
          'form' => 'action',
          'link' => 'href',
    );
    my($e, $start) = @_;
    return 1 unless $start;
    my $attr = $link_elements{$e->tag};
    return 1 unless defined $attr;
    my $url = $e->attr($attr);
    return 1 unless defined $url;
    $e->attr($attr, url($url, $BASE)->abs->as_string);
}    

