#!/bin/perl4.036 #
# webfind
#
#
$home=$ENV{HOME};
$debug=1;
$debug=$ENV{DEBUG}; # overrides default level of 1
# $|=1;
$scratchfile = "$$$.webfind";
#
require "chat2.pl";
require "url_parse.pl";
require "url_dump.pl";
require "url_get.pl";
require "url_exclude.pl";

$SIG{'INT'}='abend';
$SIG{'HUP'}='abend';
$SIG{'QUIT'}='abend';

$wfurl = "(file|http|ftp|news|telnet)\:\/\/(.+\.*)+(:[0-9]+)?(\/)*(.+\/*)+" ;
                                                        #well formed URL
$wfhst = "(file|http|ftp|news|telnet)\:\/\/(.+\.*)+(:[0-9]+)?\/" ;     #well formed server
$wfdn  = "\/\/(.+\.*)+\/" ;                 #well formed domain

$maxdepth = 3;
$nopush = $breadth = $start = $domain = ""; 
$host = $file = $grep = $agrep = $url = $print = "";

$sleepdur = $prerror = $depth = $get = $total_visited = $exec = "0";
$lasthost = "localhost"; # initial setting only!

undef %pretty;
undef %urlerrors;
#
@cmd_parms = @ARGV;
if ($debug >= "3") { print("the command line was: ","@cmd_parms","\n") };

$progname = $0;

if ($#ARGV <= 1) {
  print <<"%%";

  $0 -- search for stuff in the World Wide Web

  syntax: $0 -constraint [-constraint ...] 
 
  valid constraints are:
 
  -start [url]          start search from URL [url] (default: this host)
  -level n              max number of levels to traverse (default: 3)
  -depth | -breadth     do depth-first or breadth-first traversal
  -domain domain.org    limit search to domain (default: sco.com)
  -host [hostname]      limit search to host [hostname] (implies -start)
  -nice [number]        sleep [number] seconds between lookups 
  -file [string]        matches where filename contains [string]
  -grep [string]        matches for files containing [string]
  -url [string]         matches where file's URL contains [string]
                        (i.e. where file points to [string])
  -get                  dump all hits to local filesystem, substituting
                        http://localhost/ for current sitename
  -print [URL|rep|pret] print URL, or print parent, or print pretty.
                        URL simply prints URLs found. rep prints
                        parent->child tuples. pret prints neat
                        hierarchies.
  -error                print verbose error message if lookup fails
  -exec  {}[] \\;   exec program , URL replaces {}, temp filename
                        replaces [],  terminated by \;

%%
  exit(1) ;
}
#
# -- loop on @ARGV
#
foreach $i (0 .. $#ARGV) {
  $item = $ARGV[$i];
  $param = $ARGV[++$i] ;
  last if $item eq "--" ;
  if ($item eq "-start")    {
        $start = $param ;
        if ($start !~ /$wfurl/) { 
           print "ERROR -- ".$start." is not a recognized URL! \n" ;
       }
  } ;
  if ($item eq "-level")    {
        $maxdepth = $param ;
  } ;
  if ($item eq "-domain")   {
        $domain = $param ;
  } ;
  if ($item eq "-host")     {
        $host = $param ;
       if ($start eq "") { 
           $start = $host ;
       }
  } ;
  if ($item eq "-get") {
           $get = "1";
           unshift(@ARGV,$param);
           --$i;
   }
   if ($item eq "-error") {
           $prerror = "1";
           unshift(@ARGV,$param);
           --$i;
  }
  if ($item eq "-depth")        {
        if ($breadth ne "1") {
            $depth= "1";
            unshift(@ARGV,$param); 
            --$i;
        } else {
            print "ERROR! -depth specified after -breadth!\n\n";
            exit 1;
        }
  } ;
  if ($item eq "-breadth")        {
        if ($depth ne "1") {
            $breadth = "1";
            unshift(@ARGV,$param); 
            --$i;
        } else {
            print "ERROR! -breadth specified after -depth!\n\n";
            exit 1;
        }
  } ;
  if ($item eq "-file")     {
        $file = $param;
  } ;
  if ($item eq "-nice")    {
        $sleepdur = $param;
  } ;
  if ($item eq "-grep")     {
        $grep = $param;
  } ;
  if ($item eq "-url")      {
        $url = $param;
  } ;
  if ($item eq "-print")    {
        $param =~ tr/A-Z/a-z/;
        if ($param eq "url")  { $print = 1 } 
        elsif ($param eq "rep") { $print = 2 } 
        elsif ($param eq "pret") {$print = 3 }
    else { print "ERROR -- illegal print argument" } ; 
  } ;
  if ($item eq "-exec")     {
     # hairy bit -- fiddles with ARGV directly
     $numterms = 0 ;
     foreach $xarg ($i .. $#ARGV) {
        if ($ARGV[$xarg] eq ";") { last } ;
        $execline = $execline." ".$ARGV[$xarg] ;
        $numterms++ ;
     }
     $exec = $execline ;
     $_ = $exec;
     s/\{\}/$scratchfile/;
     $exec = $_ ;
     $i += $numterms ; 
     for ( $catchup = 1; $catchup <= $numterms; $catchup++) { shift }
 } ;
 $i++;
 shift;
}
# sanitize the default settings

$sleepdur = 5 unless defined($sleepdur);

if ( ($breadth == "") && ($depth == "") ) {$breadth="1"};
if ($debug >=4) {
    print "===========================\n";
    print ("\$start is set to ",$start,"\n");
    print ("\$maxdepth is set to ",$maxdepth,"\n");
    print ("\$domain is set to ",$domain,"\n");
    print ("\$host is set to ",$host,"\n");
    print ("\$file is set to ",$file,"\n");
    print ("\$grep is set to ",$grep,"\n");
    print ("\$get is set to ",$get,"\n");
    print ("\$url is set to ",$url,"\n");
    print ("\$print is set to ",$print,"\n");
    print ("\$depth is set to ",$depth,"\n");
    print ("\$breadth is set to ",$breadth,"\n");
    print ("\$exec is set to ",$exec,"\n");
    print ("\$prerror is set to ",$prerror,"\n");
}
if ($error ne "" ) {print "\n",$error,"\n"};

#&url_exclude'make_excludelist;
#
#if ($get eq "1") {
#    dbmopen(filestore,"filestore.dbm",0666) || 
#         print "Could not open dbm file\n";
#}
#
# %filestore is a dbm array containing all -get snarfed files
# BUG: this code commented out 'cause dbm has a hard limit of 1024 bytes
# per record. This means the idea of stashing files one per record needs
# a rethink unless I can come up with a copy of perl that uses gdbm
#----------------------- from here on, it walks the web ---------------


$n=1;

# -start argument overrides -host
# may need to add "index.html" to end of -host name

$start_depth=1;
if (($start == "") && ($host != "")) {
    $start = $host;
}
$prevurl = $start;
if ($print eq "2") {
    print "ROOT: $start\n";
}
&putnext($start,$start_depth);
$num_items = @tour_list /2; 

#---------------- Main loop

main_loop:
until ($num_items <= 0) {
    ($thisurl,$xdepth) = &getnext;
    $num_items = @tour_list /2; 
    if ($debug >=2 ) { print "depth: $xdepth\nurl: $thisurl\n";  }
    &do_break_actions;
    undef $doc; 
    undef @bannedurls;
    @bannedurls = &url_exclude'off_limits($thisurl, "websnarf");
    undef $bannedflag;
    if ($#bannedurls > 0) {
        foreach $fragment (@bannedurls) {
           chop $fragment;
           $bannedflag = index($thisurl,$fragment);
        }
    }
    if ($bannedflag > 0) {
       if ($prerror eq "1") { 
          print "Bannedflag: $bannedflag\n";
          print "EXCLUDED by NoRobots protocol: $thisurl \n"; 
       }
          next main_loop ;
    } else { 
        sleep($sleepdur);
        $doc= &url_get'url_get($thisurl) ;     
        if ($debug >=2) {
           print "got: \$thisurl is $thisurl, size:", length($thisurl), "\n";
       }
       $total_visited++;   
    }
    if (($doc eq "1") || ($doc =~ /404 Not Found/) || ($doc =~ /^40\d/)) {
        if ($prerror eq "1") {
            print "\nERROR: $thisurl returned\n---\n$doc\n---\n\n";
            $urlerrors{$thisurl} = $doc;
        }
        next main_loop;
    }
    next main_loop if $doc eq '1';
    next main_loop if $doc=~/Directory Index of/;
    next main_loop if $doc=~/404 Not Found/;
    
   $actresult = &do_actions;          # this calls the test loop

    resume:
    if ( $xdepth <= $maxdepth) {
           &addurls($doc, $thisurl, $xdepth);
    } 
    $prevurl = $thisurl;
}
#---------------------- end of mainloop ----------------
if ($debug >=2 ) {
    print "\n---\n\nRun finished!\n---\n\n";
    print "post-mortem queue dump:\n\n---\n@tour_list\n\n";
    print "Total push list: $totalpushed\n";
}

#if ($get eq "1") {
#    &makeget_tree;
#    dbmclose(%filestore);
#    exec "rm filestore.dbm";
#}

-e $scratchfile && exec "rm $scratchfile" ;
# &url_exclude'write_excludelist;
print "\nFinished:\n\nvisited $total_visited pages on the web\n";
if ($print eq "3") {&prettyprint };
exit(0);

#----------- END OF MAIN PROGRAM -----------------------------------

#sub makeget_tree {
#    while (($scanurl, $scanfile) = each %filestore) {
#        ($mprotocol, $mhost, $mport, $mrest1, $mrest2, $mrest3) = 
#             &url_parse'parse_url($scanurl);
#        if ($mprotocol eq "http") {
#             $newurl = "/".$mhost.$mrest1.$mrest2.$mrest3;
#             while (($transurl, $transfile) = each %filestore) {
#                $filestore{$transurl} =~ s/$scanurl/$newurl/gi ;
#             }
#        }
#    };
#    # now write out the frobnicated files
#    while (($scanurl, $scanfile) = each %filestore) {
#        if (substr($scanurl,-1,1) eq "/") {
#            $destname = "index.html";
#            chop $scanurl;
#        } 
#        undef @bits;
#        @bits = split(/\//,$scanurl);
#        $creatpath = ".";
#        for ($i=0; $i<$#bits;$i++) {
#            $creatpath = $creatpath."/".$bits[$i];
#            print "creating $creatpath \n";
#            mkdir($creatpath,777) || print "could not create $creatpath\n;" ;
#            $chresult = chmod 0755, "$creatpath";
#            print "chmod(); result $chresult on $creatpath\n";
#        }
#        $creatpath = $creatpath."/".$destname;
#        print "creating $creatpath\n";
#        open (DUMP, ">$creatpath") || print "could not open $creatpath\n";
#        select DUMP; $| = 1;
#        print DUMP $scanfile;
#        close DUMP; $| = 0;
#    }; # end of write loop
#};
#
#--------------- MAINLOOP EXIT CONDITION SCAN ------------------------

sub do_break_actions {

    # this lot is questionable since I grafted in the url parse library
    # not quite broken, but needs re-writing

    # return if ($thisurl =~ /\d+/) ; # break if stack's gotten messy

    ($dprotocol, $dhost, $dport, $drest1, $drest2, $drest3) = 
        &url_parse'parse_url($thisurl);

    if ($debug >=4 ) {
       print "Doing break actions: \n";
       print "thisurl: $thisurl\ndepth: ", $xdepth ,"\n";
    }

    # avoid infinite loops

    @been_here_before = grep(/$thisurl/,@tour_list);
    if ($#been_here_before > 10) { # we're stuck in a loop
        @foo = grep(!/$thisurl/, @tour_list);
        @tour_list = @foo;
        next main_loop;
        # we have now dumped all the repeating elements and reset the 
        # current URL
    } # end of loop handler

    # if we exceed the -depth argument, set nopush to true; else false

    if ($xdepth >= $maxdepth){
           $nopush = "true";
    } else {
           $nopush = "false";
    }

    # return if we've wandered outside the -domain (if -domain is set )

    if ( ($debug >= 4) && ($domain ne "") ) {
        print "domain limited to $domain: now in: $mysite\n" ;
    };
    
   # if  ($#tour_list > 0) {
   #         next main_loop if (($domain ne "") && ($dhost !~ /$domain/i)) ;
   #     } else {
   #         exit(1) if (($domain ne "") && ($dhost !~ /$domain/i)) ;
   # }

    # return if we've wandered away from the -host (if -host is set )

    if ($debug >= 4) {
        print "host limited to $host: now in: $thisurl\n" ;
    };

    next main_loop if (($host ne "") && ($dhost !~ /$host/i)) ;

    if ($debug >= 4) { print "Checked break actions -- no break\n";}
} 

#--------------- PREDICATE MATCHING LOOP ----------------------------

# unverified

sub do_actions {
    local($pres) = "0";
    # this loop permits the program to do find(1) style predicate
    # testing instead of normal (orthogonal) option handling -- if desired

#    foreach $predicate (@predicate) {
#        if ($predicate eq "-grep") { $pres = &do_grep };
#        if ($predicate eq "-get")  { $pres = &url_dump'do_get($debug, $thisurl, $doc) };
#        if ($predicate eq "-print"){ $pres = &do_print };
#        if ($predicate eq "-url")  { $pres = &do_url };
#        if ($predicate eq "-exec") { $pres = &do_exec };
#    }
#    goto resume ;

if ($grep  ne "") { &do_grep };
if ($get ne "0")  { 
    if ($debug >=2) { print "getting file \n";}
    $pres = &url_dump'do_get($debug, $thisurl, $doc) 
};
# if ($get ne "")  { $pres = &do_get($debug, $thisurl, $doc) };
# if ($print ne "") { &do_print };
if ($exec ne "") { &do_exec };
return $pres;
}

#--------------- PREDICATE FUNCTIONS ------------------------------

# unverified

sub do_grep {
    @hit=grep($grep, $file_text);
    if ($#hit != 0) {
       return 0
    } else {
       return 1
    }
}

sub do_print {

#      if ($print eq "1") { 
#         print("http://$mysite:$myport$mypath\n") && return 0;
#      } else {
#         if ($print eq "2") { 
#            print("$prevurl : $thisurl\n") && return 0;
#         };
#      }
      return 1;
}

sub do_url {
    if ( $url =~ $mysite) {
        if ($url =~ $mypath) {
            return 0
        } else {
            return 1
        }
    } else {
        return 1;
    }
}

sub do_exec {
    $hit = 1;
    $task= $exec;
    if ($task =~ /$scratchfile/) {
       open (SYSTST, ">$scratchfile"); $|=1;
       print SYSTST $doc;
       close SYSTST;
    }
    $urlname = $mysite.$mypath; # create a unique url name
    @urlnames = split("/",$urlname);
    $urlname = "";
    foreach $bit (@urlnames) {
       $urlname = $urlname."_".$bit;
    }

    $task =~ s/\[\]/$urlname/ ;    # subst $urlname for [] in $exec

    if ($task ne "0") { 
        $hit = system ($task) ;
    }
    return $hit/255;
    -e $scratchfile && exec "rm $scratchfile" ;
}


#------------------ SUPPORT FUNCTIONS ------------------------------

sub listen {
    local($secs) = @_;
    local($return,$tmp) = "";
    while (length($tmp = &chat'expect($secs, '(.|\n)+', '$&'))) {
        print $tmp if $trace;
        $return .= $tmp;
        (return $return) if (length($return) > 100000);
    }
    $return;
}

# unverified

sub getnext {
    local ($dep, $url, $tmp) = "";
    local (@ret);
    if ($depth == 1) {     # depth-first == stackwise
        $ret = pop(@tour_list);
	if ($debug >=4) {print "popped ";}
    } else {                 # breadth-first == queuewise
        if ($breadth == 1) {
            $ret = shift(@tour_list);
	    if ($debug >= 4) {print "shifted ";}
        }
    }
    ($url,$dep) = split(',',$ret);
    if ($debug >=4) {print "($url,$dep)\n";}
    if (($dep !~ /\d+/) ||
        ($url =~ /href=(\"*)([^\"]+(\"*)).*A>/gi) ) { 
        print "HORRID WEIRDNESS DETECTED IN &getnext(); KLUGING\n";
        $tmp1 = $dep; $url = $dep; $dep = $tmp1; 
    };
    return ($url,$dep);
}

sub putnext {
    local(@item)=@_ ;
    if ($#item != $[ + 1) {
        print "UNHOLY EVIL DETECTED!!! Wrong number of args in putnext()\n";
    }
    $element = join(',',@item);
    if ($debug >=4 ) {print "Pushing (url,depth) = ($element)\n";}
    push(@tour_list, $element) || print "putnext(); PUSH FAILED \n";
    $totalpushed++;
}

sub prettyprint {
    local ($numentities, $prtdex, $indent, $leader, $i, $target) = 0;
    local (@visitvec);
    print "Probed $maxdepth levels into the web, using ";
    if ($breadth == "1") { 
        print "breadth-first traversal\n";
    } else {
        print "depth-first traversal\n";
    };
    print "\n\nReport:\n\nURLs are listed, indented by levels beneath ";
    print "their parents.\n";
    print "Parents are indented by one arrow \"-->\" for each level down at\n";
    print "which they were encountered.\n\n";
    print "URLs surrounded by (brackets) were not retrieved by $progname.\n\n";
    print "--" x 30;
    print "\n\n";

    foreach $prtdex (keys(%pretty)) {
	undef @visitvec;
        @visitvec = split(/\+/,$pretty{$prtdex});
        $numentities = $#visitvec;
        $indent = $visitvec[0];
        $leader = "(".$visitvec[0].")";
        $leader .= '-->' x ($indent - 2);
        if (index($visitvec[1],"*") == $[) {
             $visitvec[1] =~ s/\*//; 
	 }
        print "\n$leader$visitvec[1]\n";
        $leader = '   ' x ($indent -1) ;
        $leader .= "|--";
        for ($i=2; $i<=$numentities;$i++) {
            $target = $visitvec[$i];
            if ($i == $numentities) { 
                $leader = '   ' x ($indent -1);
                $leader .= "+--";
            };
            if (index($target,"*") == $[) {
                $target =~ s/\*//; 
		print "$leader($target)\n";
            } else {
                if (defined $urlerrors{$target}) {
                    print "$leader$target *** ERROR ***\n";
                } else {
                    print "$leader$target\n";
                }
            }           
        }
    }
    print "\n----\n\nEnd of report\n\n----\n";
}
#--------------------- abort routine -----------------------------------

sub abend {
    print "ERROR: program terminated by signal!\n";
    &prettyprint;
    exit 1;

}

#--------------------- new, bulletproof stack-adding routine ------------

sub addurls {
      local ($file, $prev_url, $xdepth) = @_ ;
      local ($endflag) = "";
      local ($foo) = "";
      local ($previous) = $prev_url;

      $file =~ s/\n//i;
      $file =~ s/href/HREF/gi;
      $file =~ s/\/a>/\/A>/gi;
      if ($debug >=5) {print "addurls() called with \$previous:$previous\n";}
while ($endflag eq "") {
      # iteratively get URLs and process them
      #
      $start=index($file,"HREF");
      if ($start == -1) { $endflag = 1};
      $end=index($file,"\A>");
      if ($end == -1) { $endflag = 2};
      $thisurl =  substr($file,$start, ($end-$start)+2 );
      $file = substr($file,$end+2, (length($file)-($end+2)));
      $thisurl =~ s/\s//gi;
      if ($thisurl =~ s/HREF=\"//) {
	  $tmp1 = substr($thisurl,$[, index($thisurl,'"'));
      }
      if ($thisurl =~ /HREF=[^"]/) {
          $thisurl =~s/HREF=//;
          $tmp1 = substr($thisurl,$[, index($thisurl,'>'));
      }
      # $tmp1 now contains a URL. Check to see if it's partial
      #
      $tmp1 =~ s/http/http/i;
      $tmp1 =~ s/ftp/ftp/i;
      $tmp1 =~ s/gopher/gopher/i;
      $tmp1 =~ s/file/file/i;
      $tmp1 =~ s/news/news/i;
      $tmp1 =~ s/mailto/mailto/i;
      $tmp1 =~ s/telnet/telnet/i;
      $tmp1 =~ s/wais/wais/i;
      #
      if ($tmp1 !~ /((http)|(ftp)|(gopher)|(file)|(news)|(mailto)|(telnet)|(wais).+)/) {
         # make absolute URL using $prev_url
         #
         undef @dirstack;
         foreach $foo (split("/",$prev_url)) {
            if ($foo ne "") { 
                 push(@dirstack, $foo);
            };
         };
         if (substr($prev_url,-1) ne "/") {
             pop @dirstack;
         }
         @relative = split('/',$tmp1); 
         for ($i=$[; $i <= $#relative; $i++) {
             next if (@relative[$i] eq "");
             if (@relative[$i] =~ /\.\./) {
                 pop @dirstack;
             } else {
                 push(@dirstack, @relative[$i]);
             };
         };
         # dirstack now contains a stack of path elements
         #
         $tmp1 = '';
         foreach $elem (@dirstack) {
            $tmp1 = $tmp1.'/'.$elem;
         };
         $tmp1 = substr($tmp1,1,(length($tmp1)-1));
      };
      # $tmp1 now contains an absolute url. Lets do some post-prettifying
      #
      if ($tmp1 =~ /http/) {
          if (substr($tmp1, -1) eq "/") {
             $tmp1 = $tmp1.'index.html';
          };
          if (rindex($tmp1,'/') <= rindex($tmp1,'#')) {
             # we don't want no steeking local refs!!
             #
             $tmp1 = substr($tmp1, $[, (rindex($tmp1,'#')));
          };
      }
      $firstslash = index($tmp1,'/');
      if (substr($tmp1, ++$firstslash,1) ne '/') {
          $tmp1 = substr($tmp1,$[,$firstslash).'/'.substr($tmp1,$firstslash);
      }

      # ignore if it's a weird file

      next if $tmp1 =~ /\.mp(e)*g/i ;
      next if $tmp1 =~ /\.zip/i ;
      next if $tmp1 =~ /\.tar/i ;
      next if $tmp1 =~ /\.hqx/i ;
      next if $tmp1 =~ /\.ps/i ;
      next if $tmp1 =~ /\.gz/i ;
      next if $tmp1 =~ /\.bin/i ;

      # log, but don't push, if it's not readable

      if ($tmp1 =~  /wais:/i) {$nopush = "true"};
      if ($tmp1 =~  /news:/i) {$nopush = "true"};
      if ($tmp1 =~  /telnet:/i) {$nopush = "true"};
      if ($tmp1 =~  /.*\?.*=.*/i) {$nopush = "true"};
      if ($tmp1 =~  /.*\?.*/i) {$nopush = "true"};

      # don't bother retreiving wais, news, telnet, or query spaces
      
      if ($domain ne "") {
         if ($tmp1 !~ /$domain/i) { $nopush = "true" }; 
      }
      # don't visit URLs that lie outside a -domain argument
 
      if ($print eq "1") { 
        print("$tmp1\n") ;
      } else {
         if ($print eq "2") {
            if ($nopush ne "true") {
                print("$xdepth:$previous --> $tmp1\n") ;
            } else {
                print("$xdepth:$previous --> $tmp1 (not dereferenced)\n");
            }
         };
      }
      if ($print eq "3") {
         print "+";
         if ($pretty{$previous} eq "") {
            $pretty{$previous} = $xdepth;
         }
         #if (($pretty{$previous} =~ /$tmp1/) == 1 ) {
         #print "found $tmp1 in \$pretty\{$previous\}\n";
            if ($nopush ne "true") {
               $pretty{$previous} .= "+".$tmp1;
            } else {
               if ($nopush eq "true") {
                   $pretty{$previous} .= "+"."*".$tmp1;
               }
            }
        #}
      } # %pretty is indexed on URLs. Each entry contains a "+"-separated
        # string of values, starting with the depth at which the URL was 
        # first encountered, followed by all its children; children which
        # will not be visited are prefixed with an asterisk. Note that a
        # given URL should not appear in the list more than once. 

      if ($nopush eq "false") {
          &putnext($tmp1,($xdepth+1));
          # $prevurl = $tmp1;
          $num_items = @tour_list /2;
      }      
};

}