#!/home/charlie/perl

# ------------------------------------------------------------------------
# check for server that doesn't want robot accesses

require "/u/charless/perl/snarf/chat2.pl";
require "/u/charless/perl/snarf/url_parse.pl";

$tmout = 60;

sub off_limits {
    local($url, $app) = @_;
    local(@prohibited);
    local($server, $port, $path, $robotfile, $atype, $ret, $prohibit);
    local($in_agent) = "0";
    if ($url =~ m#^http://[^/]+:70[^0-9]#i)
    { # gopher masquerading as http, asume okay
      return 0;
    }

    # Parse URL

#    ($server, $port, $path) = ($url =~ m#^([a-z]+://[^/]+(:[0-9]+)?)(.*)#i);
    ($protocol, $server, $port, $rest1, $rest2, $rest3) = 
           &url_parse'parse_url($url);
    $path = $rest1.$rest2.$rest3;
print "server: $server\nport: $port\npath: $path\n";    

    if ($port eq "") { $port = 80 ;}
    $server =~ tr/A-Z/a-z/;

    if ($protocol =~ /^http/) { 
      $atype = $robotaccess{$server}; # "", "ok", or "disallow paths"
      # If we haven't cached a Robot Exclusion file, check it
      if (length ($atype) < 1) {
          $robotfile = "/robots.txt";
          $rname = '.robots/' . $server;
          if (-e $rname) { 
               print "Already had $rname\n"; 
               open(ROBOT, "<$rname") ; 
               select (ROBOT);
               undef $ret;
               while (<ROBOT>) {
	           $ret = $ret.$_ ;
               };
               close ROBOT;  
               select STDOUT;
          } else { 
               print "; Looking for $rname\n";
               $handle =&chat'open_port($server, $port) || do { 
                   print "*** Norobots failed: $server:$port ***";
                   exit 1;
                };
                &chat'print("GET $robotfile\n");
                $ret=&listen($tmout);  
                &chat'close($handle);
                unless ( -d '.robots' && -w '.robots' && -x '.robots') {
                   print ".robots directory does not exist; creating\n";
                   mkdir('.robots',0777);             
                   $chresult = chmod 0755, '.robots';
                } else {
                   print "We have a .robots directory!\n";
                }

                open (ROBOT, ">$rname") || die "cannot open $rname\n";
                select (ROBOT); $| = 1;
                print ROBOT $ret;
                close (ROBOT); $|=0;
                select STDOUT;
            }# length($atype) <1

          @ret = split(/\n/,$ret);       
          MAIN:
          foreach (@ret) { 
             s/#.*$//i;
             if (($_ =~ /^User-agent:/) && 
                 ($_ =~/$app/i)||($_ =~/\*/)) { 
                # check to see if it's about us; if so, switch in_agent on
                #
                $in_agent = "1";
 	        $atype = "disallow";
             } else {
                # not us; therefore switch in_agent off
                #
                if (($_ =~ /User-agent:/) && ($_ !~ /$app/i)) {
                   $in_agent = "0";
                }
             } # end of user-agent toggle check
	     if ($in_agent == "1") {
             # if the in_agent toggle is ON ..
             #
                if (/Disallow:\s*(.*\S)/io) {
                   # add Disallow: subjects to atype
                   #
                   $atype .= " $1";
                }             
	     }
          }  
     # }

# print "parsing \$ret: $_\n";
#            if (($_ =~/^User-agent:.*\W$app\W/io) || 
#                ($_ =~ /^User-agent:\s*[*]/io)) { 
#print "-- found $app or User-agent: in this line\n";
#               $atype = "disallow";
#               while () {
#                  if (/Disallow:\s*(.*\S)/io) {
#                     $atype .= " $1"; 
#                  } else {
#                     last MAIN; 
#                  }
#               }  
#    
#               last MAIN;


        $robotaccess{$server} = $atype;
        print "; robot access: \n---\n$atype\n---\n";
      }# server == http:

      # Loop through paths, checking for prefix
print "Got here ...\n";
print "atype is\n--\n$atype\n--\n";
      foreach $prohibit (split (/\s+/, $atype)) {
	  unless ($prohibit eq "disallow") {
	      push(@prohibited, $prohibit);
          }
print "prohibit: $prohibit\n";
        if (length ($prohibit) > 0 && $path =~ /$prohibit/) {
          print "; access disallowed for robots: $path\n";
          return (@prohibited);
        }
      }
      
      print "; access allowed for robots\n";
      return (@prohibited);
    }
    else
    { return ("ERROR"); }             # no server, can't check, assume okay
}

# ----------------------------------------------------------------
# Main program

$url = $ARGV[0];

if (@retvar = &off_limits ($url, "fish"))
{ print "Returned:\n---\n@retvar\n--- \n"; }
else
{ print "something went wrong\n"; }


sub listen {
    local($secs) = @_;
    local($return,$tmp) = "";
    while (length($tmp = &chat'expect($secs, '(.|\n)+', '$&'))) {
        print $tmp if $trace;
        $return .= $tmp;
        (return $return) if (length($return) > 100000);
    }
    $return;
}
