#!/home/charlie/perl
require "chat2.pl";  
require "url_parse.pl";
package url_exclude;

$tmout = 60;

sub off_limits {
    local($url, $app) = @_;
    local(@prohibited);
    local($server, $port, $path, $robotfile, $atype, $ret, $prohibit);
    local($in_agent) = "0";
    if ($url =~ m#^http://[^/]+:70[^0-9]#i)
    { # gopher masquerading as http, asume okay
      return 0;
    }

    # Parse URL
    ($protocol, $server, $port, $rest1, $rest2, $rest3) = 
           &url_parse'parse_url($url);
    $path = $rest1.$rest2.$rest3;

    if ($port eq "") { $port = 80 ;}
    if ($protocol =~ /^http/) { 
      $atype = $robotaccess{$server}; # "", "ok", or "disallow paths"
      # If we haven't cached a Robot Exclusion file, check it
      if (length ($atype) < 1) {
          $robotfile = "/robots.txt";
          $rname = '.robots/' . $server;
          if (-e $rname) { 
               if ($main'debug >=4) {print "Already had $rname\n"; }
               open(ROBOT, "<$rname") ; 
               select (ROBOT);
               undef $ret;
               while (<ROBOT>) {
	           $ret = $ret.$_ ;
               };
               close ROBOT;  
               select STDOUT;
          } else { 
               if ($main'debug >=4) {print "; Looking for $rname\n";}
               $handle =&chat'open_port($server, $port) || do { 
                   print STDERR "*** Norobots check failed: $server:$port ***";
                };
                &chat'print("GET $robotfile\n");
                $ret=&main'listen($tmout);  
                &chat'close($handle);
                if (($ret =~ /404 Not Found/) ||
                    ($ret =~ /Error code 403/) ||
                    ($ret =~ /Bad File Request/)) {
                    $ret = "OK"
                };
                unless ( -d '.robots' && -w '.robots' && -x '.robots') {
                   if ($main'debug >=2) {
                       print ".robots directory does not exist; creating\n";
                   }
                   mkdir('.robots',0777);             
                   $chresult = chmod 0755, '.robots';
                } else {
                   if ($main'debug >= 4) {
                       print "We have a .robots directory!\n";
                   }
                }

                open (ROBOT, ">$rname") || die "cannot open $rname\n";
                select (ROBOT); $| = 1;
                print ROBOT $ret;
                close (ROBOT); $|=0;
                select STDOUT;
            }# length($atype) <1

          @ret = split(/\n/,$ret);       
          
          foreach (@ret) { 
             if ($_ =~ /^OK$/) { return "OK" };
             s/#.*$//i;
             if (($_ =~ /^User-agent:/) && 
                 ($_ =~/$app/i)||($_ =~/\*/)) { 
                # check to see if it's about us; if so, switch in_agent on
                #
                $in_agent = "1";
 	        $atype = "disallow";
             } else {
                # not us; therefore switch in_agent off
                #
                if (($_ =~ /User-agent:/) && ($_ !~ /$app/i)) {
                   $in_agent = "0";
                }
             } # end of user-agent toggle check
	     if ($in_agent == "1") {
             # if the in_agent toggle is ON ..
             #
                if (/Disallow:\s*(.*\S)/io) {
                   # add Disallow: subjects to atype
                   #
                   $atype .= " $1";
                }             
	     }
          }  
         $robotaccess{$server} = $atype;
      }# server == http:

      # Loop through paths, checking for prefix
      foreach $prohibit (split (/\s+/, $atype)) {
	  unless ($prohibit eq "disallow") {
	      push(@prohibited, $prohibit);
          }
          if (length ($prohibit) > 0 && $path =~ /$prohibit/) {
             if ($main'debug >=4) {
                 print "; access disallowed for robots: $path\n";
             }
             return (@prohibited);
          }
      }
      
      if ($main'debug >=4) {print "access allowed for robots\n";}
      return (@prohibited);
    }
    else
    { return ("ERROR"); }             # no server, can't check, assume okay
}

# ----------------------------------------------------------------
# Main program example

#$url = $ARGV[0];

#if (@retvar = &off_limits ($url, "fish"))
#{ print "Returned:\n---\n@retvar\n--- \n"; }
#else
#{ print "something went wrong\n"; }


#sub listen {
#    local($secs) = @_;
#    local($return,$tmp) = "";
#    while (length($tmp = &chat'expect($secs, '(.|\n)+', '$&'))) {
#        print $tmp if $trace;
#        $return .= $tmp;
#        (return $return) if (length($return) > 100000);
#    }
#    $return;
#}
