Robots: A Tutorial

CharlieSpider 3




Name
CharlieSpider/0.3
Usage
http://your.server.com/cgi-bin/spider.pl?target=http://target.server.com/
Requirements
  • Perl 5.002+
  • Net-FTP-1.18
  • libwww-perl-5.00
  • CGI.pm 2.0+
Description
CharlieSpider is invoked as a CGI script. If invoked with no parameters, it prints a form containing a text entry field for the target URL. If invoked with the 'target' parameter (for example, by entering a URL in the field), it retrieves the specified file, parses it as HTML, expands any URLs present in the file, and issues HEAD requests via HTTP in order to test whether the links are valid.
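
For example, the HEAD-checking step on its own looks roughly like the following sketch. It uses the same libwww-perl calls as the full script further down; the target URL is only a placeholder.

use LWP::UserAgent;
use HTTP::Request;
use HTTP::Status;

my ($ua) = new LWP::UserAgent;
$ua->agent('CharlieSpider/0.3');
my ($req) = new HTTP::Request HEAD => 'http://target.server.com/index.html';
my ($res) = $ua->request($req);
if ($res->is_success) {
    print "link OK\n";
} else {
    print "link broken (", status_message($res->code), ")\n";
}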

For each valid link which targets a document on the current server, it then repeats the process, until no more HTML documents are found in the local web.
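
A link counts as "local" when, after expansion against the base URL, its host matches the host of the starting page. A minimal sketch of that test, with the base URL and the relative link as placeholder values:

use URI::URL;

my ($BASE) = 'http://target.server.com/';
my ($link) = 'docs/notes.html';              # a relative link found in the page
my ($abs)  = url($link, $BASE)->abs;         # expand it against the base URL
if ($abs->host eq url($BASE)->host) {
    print $abs->as_string, " is local -- recurse into it\n";
} else {
    print $abs->as_string, " is remote -- HEAD-check only\n";
}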

Every time it encounters a new server, it downloads the /robots.txt file from the server. It obeys the robot exclusion protocol, and will not download files or make requests to servers from which it is forbidden.
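
In outline, that check works like the sketch below, which uses the same WWW::RobotRules and LWP::Simple modules as the full script; the target URL is a placeholder.

use WWW::RobotRules;
use LWP::Simple;
use URI::URL;

my ($rules)      = new WWW::RobotRules 'CharlieSpider/0.3';
my ($target)     = 'http://target.server.com/private/index.html';
my ($robots_url) = url('/robots.txt', $target)->abs->as_string;
my ($robots_txt) = get($robots_url);              # undef if the fetch fails
$rules->parse($robots_url, $robots_txt) if defined $robots_txt;
print $rules->allowed($target) ? "allowed\n" : "forbidden by robots.txt\n";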

(Note that this is a recursive-descent variation on CharlieSpider/0.1.)

Limitations
CharlieSpider/0.3 checks the content-type of each retrieved document and rejects non-HTML files. It does not know how to handle the SMTP protocol (unless you've installed the SMTP module for Perl 5). It may also report an error for a link that is actually reachable when the failure is caused by network connectivity problems rather than a genuinely broken link.

If a document contains URLs pointing to internal sections (fragment identifiers), each of them is treated as a separate link to check; one possible workaround is sketched below.
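
If that matters for your web, one approach (illustrative only, not part of CharlieSpider) is to strip the fragment before recording a URL as visited, so that page.html and page.html#top count as the same document:

use URI::URL;

my %seen;
foreach ('http://target.server.com/page.html',
         'http://target.server.com/page.html#top') {
    my ($u) = url($_);
    $u->frag(undef);                        # drop the #fragment part
    print "duplicate: $_\n" if $seen{$u->as_string}++;
}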


#!/usr/bin/perl 

use Config;
use URI::URL;
use HTML::Parse;
use HTTP::Response;
use HTTP::Status;
use LWP::Debug;
use LWP::UserAgent;
use LWP::Simple;
use CGI;
use CGI::Carp;
use WWW::RobotRules;

my($q) = new CGI;
my ($rlz) = new WWW::RobotRules 'CharlieSpider';
my (%visited);
my (%broken);
my (%hosts);
$| = 1;    # unbuffer output so progress is printed as it is generated

print $q->header( -type => 'text/html',
                  -status => '200 OK',
                  );

print $q->start_html( -title=>'CharlieSpider/0.3',
                      -author=>'charlie@antipope.org',
                      -BGCOLOR=>'#F0F0F0' );

print $q->h1('CharlieSpider/0.3');
print $q->h2('Recursive-descent link checker');
print $q->p('Robot exclusion protocol-aware version');
print $q->hr;


print "<PRE>\n";

$q->param('target')? check_page($q->param('target'), \%visited, \%broken, \%hosts, $rlz) : usage();

print "</PRE>\n";
print $q->end_html;

exit 0;

# check_page: retrieve one page, HEAD-check every link in it, then recurse
# into unvisited HTML pages on the same server.
sub check_page {
   my ($target) = shift @_;
   my ($visited) = shift @_;
   my ($broken) = shift @_;
   my ($hosts) = shift @_;
   my ($rlz) = shift @_;
   my ($BASE) = $target;
   my ($total_links)  = 0;
   my ($error_links) = 0;
   my ($forbidden_links) = 0;
   my (%errors);
   my (%expansion);   # maps expanded (absolute) URLs back to the URLs as written
   my (@doc_links, @exp_links);

   if (robocheck($target, $hosts, $rlz) == 1) {
       print "\n$target\n\tNOT CHECKED: Robot exclusion in force\n\n";
       return;
   };

   my ($ua) = new LWP::UserAgent ;
   $ua->agent('CharlieSpider/0.3');
   my ($req) = new HTTP::Request GET => $target;
   my ($res) = $ua->request($req);
    
   # Check the outcome of the response
   if ($res->is_success) {
      $visited->{$target}++;
      print "$target\n";
      if ($res->content_type !~ /html/i) {
          print "\t-- not an HTML document\n\n";
          return;
      }
      my($p) = parse_html($res->content);
      # First pass: collect the links exactly as written in the document.
      for (@{ $p->extract_links(qw(a)) }) {
          my ($link) = @$_[$[];      # element 0 of each [link, element] pair
          push(@doc_links, $link);
          $total_links++;
      }
      print "\tIssuing HEAD requests ";
      for (@{ $p->extract_links(qw(a)) }) {
          my ($link) = url(@$_[$[], $BASE)->abs->as_string ;
          push(@exp_links, $link);
          if (robocheck($link, $hosts, $rlz) == 1 ) {
              print "!";
              $forbidden_links++;
          } else {
              my ($head) = new LWP::UserAgent;
              $head->agent('CharlieSpider/0.3');
              my ($head_req) = new HTTP::Request HEAD => $link;
              my ($head_res) = $head->request($head_req);
              if ($head_res->is_success) {
                  print "+"; 
                  # $visited->{$link}++;
              } else {
                  print  "-";  
                  push(@{$errors{$head_res->code}}, $link);
                  $broken->{$link}++;
                  $error_links++;
              }
          }
      }
      print "\n";
      my ($cnt) = 0;
      foreach (@exp_links) {
             $expansion{$_} = $doc_links[$cnt++];   # expanded URL => URL as written
      }
    
      print "\tFound links to $total_links pages.\n";
      if ($error_links > 0) {
         print "\tOf these, $error_links links are broken.\n";
         print "\t$forbidden_links were forbidden by robots.txt.\n";
         print "\tBreakdown by error code:\n";
         while (my ($key, $value) = each (%errors)) {
            print "\n\t$key: ", status_message($key) ,"\n"; 
            foreach (@{$value}) {
                  print "\n\t\t$expansion{$_}\n\t\t[ GET $_]";
            }
            print "\n";
         }
      } else {
         print "\tNo links are broken.\n";
         print "\t$forbidden_links were forbidden by robots.txt.\n";
      }
      print "\n\n";
      # Recurse into same-server links that are not broken, not already
      # visited, and carry no fragment or query part.
      foreach (@exp_links) {
          my ($next) =  url($_);
          next if (defined $next->frag);
          next if (defined $next->query);
          if (( $next->host eq url($BASE)->host) &&
              ( ! defined ($broken->{$next->as_string}) ) &&
              ( ! defined ($visited->{$next->as_string}) )) {
                  if (robocheck( url($_, $BASE)->abs->as_string, 
                                 $hosts, $rlz) == 1) {
                      print "\n$_\n\tNOT CHECKED: robot exclusion in force\n\n";
                  } else {
                      check_page($_, $visited, $broken, $hosts, $rlz);
                  }
          }
      }
   } else {
      print "\n$target failed (", status_message($res->code),")\n";
   }
} # check_page()

# robocheck: fetch and cache /robots.txt the first time a host is seen;
# return 1 if the target URL is forbidden to this robot, 0 otherwise.
sub robocheck {
   my ($target) = shift @_;
   my ($hosts) = shift @_;
   my ($rlz) = shift @_;
   if (! defined $hosts->{url($target)->host}) {
       my ($robotfile) = url("/robots.txt", $target)->abs->as_string;
       if (url("/robots.txt", $target)->abs->scheme eq "http") {
           # print "Fetching $robotfile for ", url($target)->host, "\n";
            my ($robots_txt) = get($robotfile);
            warn "Could not fetch $robotfile\n" unless defined $robots_txt;
            $rlz->parse($robotfile, $robots_txt) if defined $robots_txt;
           $hosts->{url($target)->host}++;
       }
   }
   if ($rlz->allowed($target)) {
       # print "robots.txt: allowed($target)\n";
       return 0;
   } else {
       # print "robots.txt: NOT allowed($target)\n";
       return 1;
   }
}

sub usage {
    print $q->startform(-method => 'get',
                        -action => $q->url);

    print $q->p("Enter the root URL for the web you want to check:");
    print $q->textfield(-name => 'target',
                        -size => '50',
                        -maxlength => '80');
    print $q->submit(-name => 'Check weblet');
    print $q->reset(-name => 'Reset');

    print $q->endform;

}

