#!/bin/perl4.036 #
# webfind
#
#
$home=$ENV{HOME};
$debug=1;
$debug=$ENV{DEBUG}; # overrides default level of 1
# $|=1;
$scratchfile = "$$$.webfind";
#
require "chat2.pl";
require "url_parse.pl";
require "url_dump.pl";
require "url_get.pl";
require "url_exclude.pl";
$SIG{'INT'}='abend';
$SIG{'HUP'}='abend';
$SIG{'QUIT'}='abend';
$wfurl = "(file|http|ftp|news|telnet)\:\/\/(.+\.*)+(:[0-9]+)?(\/)*(.+\/*)+" ;
#well formed URL
$wfhst = "(file|http|ftp|news|telnet)\:\/\/(.+\.*)+(:[0-9]+)?\/" ; #well formed server
$wfdn = "\/\/(.+\.*)+\/" ; #well formed domain
$maxdepth = 3;
$nopush = $breadth = $start = $domain = "";
$host = $file = $grep = $agrep = $url = $print = "";
$sleepdur = $prerror = $depth = $get = $total_visited = $exec = "0";
$lasthost = "localhost"; # initial setting only!
undef %pretty;
undef %urlerrors;
#
@cmd_parms = @ARGV;
if ($debug >= "3") { print("the command line was: ","@cmd_parms","\n") };
$progname = $0;
if ($#ARGV <= 1) {
print <<"%%";
$0 -- search for stuff in the World Wide Web
syntax: $0 -constraint [-constraint ...]
valid constraints are:
-start [url] start search from URL [url] (default: this host)
-level n max number of levels to traverse (default: 3)
-depth | -breadth do depth-first or breadth-first traversal
-domain domain.org limit search to domain (default: sco.com)
-host [hostname] limit search to host [hostname] (implies -start)
-nice [number] sleep [number] seconds between lookups
-file [string] matches where filename contains [string]
-grep [string] matches for files containing [string]
-url [string] matches where file's URL contains [string]
(i.e. where file points to [string])
-get dump all hits to local filesystem, substituting
http://localhost/ for current sitename
-print [URL|rep|pret] print URL, or print parent, or print pretty.
URL simply prints URLs found. rep prints
parent->child tuples. pret prints neat
hierarchies.
-error print verbose error message if lookup fails
-exec {}[] \\; exec program , URL replaces {}, temp filename
replaces [], terminated by \;
%%
exit(1) ;
}
#
# -- loop on @ARGV
#
foreach $i (0 .. $#ARGV) {
$item = $ARGV[$i];
$param = $ARGV[++$i] ;
last if $item eq "--" ;
if ($item eq "-start") {
$start = $param ;
if ($start !~ /$wfurl/) {
print "ERROR -- ".$start." is not a recognized URL! \n" ;
}
} ;
if ($item eq "-level") {
$maxdepth = $param ;
} ;
if ($item eq "-domain") {
$domain = $param ;
} ;
if ($item eq "-host") {
$host = $param ;
if ($start eq "") {
$start = $host ;
}
} ;
if ($item eq "-get") {
$get = "1";
unshift(@ARGV,$param);
--$i;
}
if ($item eq "-error") {
$prerror = "1";
unshift(@ARGV,$param);
--$i;
}
if ($item eq "-depth") {
if ($breadth ne "1") {
$depth= "1";
unshift(@ARGV,$param);
--$i;
} else {
print "ERROR! -depth specified after -breadth!\n\n";
exit 1;
}
} ;
if ($item eq "-breadth") {
if ($depth ne "1") {
$breadth = "1";
unshift(@ARGV,$param);
--$i;
} else {
print "ERROR! -breadth specified after -depth!\n\n";
exit 1;
}
} ;
if ($item eq "-file") {
$file = $param;
} ;
if ($item eq "-nice") {
$sleepdur = $param;
} ;
if ($item eq "-grep") {
$grep = $param;
} ;
if ($item eq "-url") {
$url = $param;
} ;
if ($item eq "-print") {
$param =~ tr/A-Z/a-z/;
if ($param eq "url") { $print = 1 }
elsif ($param eq "rep") { $print = 2 }
elsif ($param eq "pret") {$print = 3 }
else { print "ERROR -- illegal print argument" } ;
} ;
if ($item eq "-exec") {
# hairy bit -- fiddles with ARGV directly
$numterms = 0 ;
foreach $xarg ($i .. $#ARGV) {
if ($ARGV[$xarg] eq ";") { last } ;
$execline = $execline." ".$ARGV[$xarg] ;
$numterms++ ;
}
$exec = $execline ;
$_ = $exec;
s/\{\}/$scratchfile/;
$exec = $_ ;
$i += $numterms ;
for ( $catchup = 1; $catchup <= $numterms; $catchup++) { shift }
} ;
$i++;
shift;
}
# sanitize the default settings
$sleepdur = 5 unless defined($sleepdur);
if ( ($breadth == "") && ($depth == "") ) {$breadth="1"};
if ($debug >=4) {
print "===========================\n";
print ("\$start is set to ",$start,"\n");
print ("\$maxdepth is set to ",$maxdepth,"\n");
print ("\$domain is set to ",$domain,"\n");
print ("\$host is set to ",$host,"\n");
print ("\$file is set to ",$file,"\n");
print ("\$grep is set to ",$grep,"\n");
print ("\$get is set to ",$get,"\n");
print ("\$url is set to ",$url,"\n");
print ("\$print is set to ",$print,"\n");
print ("\$depth is set to ",$depth,"\n");
print ("\$breadth is set to ",$breadth,"\n");
print ("\$exec is set to ",$exec,"\n");
print ("\$prerror is set to ",$prerror,"\n");
}
if ($error ne "" ) {print "\n",$error,"\n"};
#&url_exclude'make_excludelist;
#
#if ($get eq "1") {
# dbmopen(filestore,"filestore.dbm",0666) ||
# print "Could not open dbm file\n";
#}
#
# %filestore is a dbm array containing all -get snarfed files
# BUG: this code commented out 'cause dbm has a hard limit of 1024 bytes
# per record. This means the idea of stashing files one per record needs
# a rethink unless I can come up with a copy of perl that uses gdbm
#----------------------- from here on, it walks the web ---------------
$n=1;
# -start argument overrides -host
# may need to add "index.html" to end of -host name
$start_depth=1;
if (($start == "") && ($host != "")) {
$start = $host;
}
$prevurl = $start;
if ($print eq "2") {
print "ROOT: $start\n";
}
&putnext($start,$start_depth);
$num_items = @tour_list /2;
#---------------- Main loop
main_loop:
until ($num_items <= 0) {
($thisurl,$xdepth) = &getnext;
$num_items = @tour_list /2;
if ($debug >=2 ) { print "depth: $xdepth\nurl: $thisurl\n"; }
&do_break_actions;
undef $doc;
undef @bannedurls;
@bannedurls = &url_exclude'off_limits($thisurl, "websnarf");
undef $bannedflag;
if ($#bannedurls > 0) {
foreach $fragment (@bannedurls) {
chop $fragment;
$bannedflag = index($thisurl,$fragment);
}
}
if ($bannedflag > 0) {
if ($prerror eq "1") {
print "Bannedflag: $bannedflag\n";
print "EXCLUDED by NoRobots protocol: $thisurl \n";
}
next main_loop ;
} else {
sleep($sleepdur);
$doc= &url_get'url_get($thisurl) ;
if ($debug >=2) {
print "got: \$thisurl is $thisurl, size:", length($thisurl), "\n";
}
$total_visited++;
}
if (($doc eq "1") || ($doc =~ /404 Not Found/) || ($doc =~ /^40\d/)) {
if ($prerror eq "1") {
print "\nERROR: $thisurl returned\n---\n$doc\n---\n\n";
$urlerrors{$thisurl} = $doc;
}
next main_loop;
}
next main_loop if $doc eq '1';
next main_loop if $doc=~/Directory Index of/;
next main_loop if $doc=~/404 Not Found/;
$actresult = &do_actions; # this calls the test loop
resume:
if ( $xdepth <= $maxdepth) {
&addurls($doc, $thisurl, $xdepth);
}
$prevurl = $thisurl;
}
#---------------------- end of mainloop ----------------
if ($debug >=2 ) {
print "\n---\n\nRun finished!\n---\n\n";
print "post-mortem queue dump:\n\n---\n@tour_list\n\n";
print "Total push list: $totalpushed\n";
}
#if ($get eq "1") {
# &makeget_tree;
# dbmclose(%filestore);
# exec "rm filestore.dbm";
#}
-e $scratchfile && exec "rm $scratchfile" ;
# &url_exclude'write_excludelist;
print "\nFinished:\n\nvisited $total_visited pages on the web\n";
if ($print eq "3") {&prettyprint };
exit(0);
#----------- END OF MAIN PROGRAM -----------------------------------
#sub makeget_tree {
# while (($scanurl, $scanfile) = each %filestore) {
# ($mprotocol, $mhost, $mport, $mrest1, $mrest2, $mrest3) =
# &url_parse'parse_url($scanurl);
# if ($mprotocol eq "http") {
# $newurl = "/".$mhost.$mrest1.$mrest2.$mrest3;
# while (($transurl, $transfile) = each %filestore) {
# $filestore{$transurl} =~ s/$scanurl/$newurl/gi ;
# }
# }
# };
# # now write out the frobnicated files
# while (($scanurl, $scanfile) = each %filestore) {
# if (substr($scanurl,-1,1) eq "/") {
# $destname = "index.html";
# chop $scanurl;
# }
# undef @bits;
# @bits = split(/\//,$scanurl);
# $creatpath = ".";
# for ($i=0; $i<$#bits;$i++) {
# $creatpath = $creatpath."/".$bits[$i];
# print "creating $creatpath \n";
# mkdir($creatpath,777) || print "could not create $creatpath\n;" ;
# $chresult = chmod 0755, "$creatpath";
# print "chmod(); result $chresult on $creatpath\n";
# }
# $creatpath = $creatpath."/".$destname;
# print "creating $creatpath\n";
# open (DUMP, ">$creatpath") || print "could not open $creatpath\n";
# select DUMP; $| = 1;
# print DUMP $scanfile;
# close DUMP; $| = 0;
# }; # end of write loop
#};
#
#--------------- MAINLOOP EXIT CONDITION SCAN ------------------------
sub do_break_actions {
# this lot is questionable since I grafted in the url parse library
# not quite broken, but needs re-writing
# return if ($thisurl =~ /\d+/) ; # break if stack's gotten messy
($dprotocol, $dhost, $dport, $drest1, $drest2, $drest3) =
&url_parse'parse_url($thisurl);
if ($debug >=4 ) {
print "Doing break actions: \n";
print "thisurl: $thisurl\ndepth: ", $xdepth ,"\n";
}
# avoid infinite loops
@been_here_before = grep(/$thisurl/,@tour_list);
if ($#been_here_before > 10) { # we're stuck in a loop
@foo = grep(!/$thisurl/, @tour_list);
@tour_list = @foo;
next main_loop;
# we have now dumped all the repeating elements and reset the
# current URL
} # end of loop handler
# if we exceed the -depth argument, set nopush to true; else false
if ($xdepth >= $maxdepth){
$nopush = "true";
} else {
$nopush = "false";
}
# return if we've wandered outside the -domain (if -domain is set )
if ( ($debug >= 4) && ($domain ne "") ) {
print "domain limited to $domain: now in: $mysite\n" ;
};
# if ($#tour_list > 0) {
# next main_loop if (($domain ne "") && ($dhost !~ /$domain/i)) ;
# } else {
# exit(1) if (($domain ne "") && ($dhost !~ /$domain/i)) ;
# }
# return if we've wandered away from the -host (if -host is set )
if ($debug >= 4) {
print "host limited to $host: now in: $thisurl\n" ;
};
next main_loop if (($host ne "") && ($dhost !~ /$host/i)) ;
if ($debug >= 4) { print "Checked break actions -- no break\n";}
}
#--------------- PREDICATE MATCHING LOOP ----------------------------
# unverified
sub do_actions {
local($pres) = "0";
# this loop permits the program to do find(1) style predicate
# testing instead of normal (orthogonal) option handling -- if desired
# foreach $predicate (@predicate) {
# if ($predicate eq "-grep") { $pres = &do_grep };
# if ($predicate eq "-get") { $pres = &url_dump'do_get($debug, $thisurl, $doc) };
# if ($predicate eq "-print"){ $pres = &do_print };
# if ($predicate eq "-url") { $pres = &do_url };
# if ($predicate eq "-exec") { $pres = &do_exec };
# }
# goto resume ;
if ($grep ne "") { &do_grep };
if ($get ne "0") {
if ($debug >=2) { print "getting file \n";}
$pres = &url_dump'do_get($debug, $thisurl, $doc)
};
# if ($get ne "") { $pres = &do_get($debug, $thisurl, $doc) };
# if ($print ne "") { &do_print };
if ($exec ne "") { &do_exec };
return $pres;
}
#--------------- PREDICATE FUNCTIONS ------------------------------
# unverified
sub do_grep {
@hit=grep($grep, $file_text);
if ($#hit != 0) {
return 0
} else {
return 1
}
}
sub do_print {
# if ($print eq "1") {
# print("http://$mysite:$myport$mypath\n") && return 0;
# } else {
# if ($print eq "2") {
# print("$prevurl : $thisurl\n") && return 0;
# };
# }
return 1;
}
sub do_url {
if ( $url =~ $mysite) {
if ($url =~ $mypath) {
return 0
} else {
return 1
}
} else {
return 1;
}
}
sub do_exec {
$hit = 1;
$task= $exec;
if ($task =~ /$scratchfile/) {
open (SYSTST, ">$scratchfile"); $|=1;
print SYSTST $doc;
close SYSTST;
}
$urlname = $mysite.$mypath; # create a unique url name
@urlnames = split("/",$urlname);
$urlname = "";
foreach $bit (@urlnames) {
$urlname = $urlname."_".$bit;
}
$task =~ s/\[\]/$urlname/ ; # subst $urlname for [] in $exec
if ($task ne "0") {
$hit = system ($task) ;
}
return $hit/255;
-e $scratchfile && exec "rm $scratchfile" ;
}
#------------------ SUPPORT FUNCTIONS ------------------------------
sub listen {
local($secs) = @_;
local($return,$tmp) = "";
while (length($tmp = &chat'expect($secs, '(.|\n)+', '$&'))) {
print $tmp if $trace;
$return .= $tmp;
(return $return) if (length($return) > 100000);
}
$return;
}
# unverified
sub getnext {
local ($dep, $url, $tmp) = "";
local (@ret);
if ($depth == 1) { # depth-first == stackwise
$ret = pop(@tour_list);
if ($debug >=4) {print "popped ";}
} else { # breadth-first == queuewise
if ($breadth == 1) {
$ret = shift(@tour_list);
if ($debug >= 4) {print "shifted ";}
}
}
($url,$dep) = split(',',$ret);
if ($debug >=4) {print "($url,$dep)\n";}
if (($dep !~ /\d+/) ||
($url =~ /href=(\"*)([^\"]+(\"*)).*A>/gi) ) {
print "HORRID WEIRDNESS DETECTED IN &getnext(); KLUGING\n";
$tmp1 = $dep; $url = $dep; $dep = $tmp1;
};
return ($url,$dep);
}
sub putnext {
local(@item)=@_ ;
if ($#item != $[ + 1) {
print "UNHOLY EVIL DETECTED!!! Wrong number of args in putnext()\n";
}
$element = join(',',@item);
if ($debug >=4 ) {print "Pushing (url,depth) = ($element)\n";}
push(@tour_list, $element) || print "putnext(); PUSH FAILED \n";
$totalpushed++;
}
sub prettyprint {
local ($numentities, $prtdex, $indent, $leader, $i, $target) = 0;
local (@visitvec);
print "Probed $maxdepth levels into the web, using ";
if ($breadth == "1") {
print "breadth-first traversal\n";
} else {
print "depth-first traversal\n";
};
print "\n\nReport:\n\nURLs are listed, indented by levels beneath ";
print "their parents.\n";
print "Parents are indented by one arrow \"-->\" for each level down at\n";
print "which they were encountered.\n\n";
print "URLs surrounded by (brackets) were not retrieved by $progname.\n\n";
print "--" x 30;
print "\n\n";
foreach $prtdex (keys(%pretty)) {
undef @visitvec;
@visitvec = split(/\+/,$pretty{$prtdex});
$numentities = $#visitvec;
$indent = $visitvec[0];
$leader = "(".$visitvec[0].")";
$leader .= '-->' x ($indent - 2);
if (index($visitvec[1],"*") == $[) {
$visitvec[1] =~ s/\*//;
}
print "\n$leader$visitvec[1]\n";
$leader = ' ' x ($indent -1) ;
$leader .= "|--";
for ($i=2; $i<=$numentities;$i++) {
$target = $visitvec[$i];
if ($i == $numentities) {
$leader = ' ' x ($indent -1);
$leader .= "+--";
};
if (index($target,"*") == $[) {
$target =~ s/\*//;
print "$leader($target)\n";
} else {
if (defined $urlerrors{$target}) {
print "$leader$target *** ERROR ***\n";
} else {
print "$leader$target\n";
}
}
}
}
print "\n----\n\nEnd of report\n\n----\n";
}
#--------------------- abort routine -----------------------------------
sub abend {
print "ERROR: program terminated by signal!\n";
&prettyprint;
exit 1;
}
#--------------------- new, bulletproof stack-adding routine ------------
sub addurls {
local ($file, $prev_url, $xdepth) = @_ ;
local ($endflag) = "";
local ($foo) = "";
local ($previous) = $prev_url;
$file =~ s/\n//i;
$file =~ s/href/HREF/gi;
$file =~ s/\/a>/\/A>/gi;
if ($debug >=5) {print "addurls() called with \$previous:$previous\n";}
while ($endflag eq "") {
# iteratively get URLs and process them
#
$start=index($file,"HREF");
if ($start == -1) { $endflag = 1};
$end=index($file,"\A>");
if ($end == -1) { $endflag = 2};
$thisurl = substr($file,$start, ($end-$start)+2 );
$file = substr($file,$end+2, (length($file)-($end+2)));
$thisurl =~ s/\s//gi;
if ($thisurl =~ s/HREF=\"//) {
$tmp1 = substr($thisurl,$[, index($thisurl,'"'));
}
if ($thisurl =~ /HREF=[^"]/) {
$thisurl =~s/HREF=//;
$tmp1 = substr($thisurl,$[, index($thisurl,'>'));
}
# $tmp1 now contains a URL. Check to see if it's partial
#
$tmp1 =~ s/http/http/i;
$tmp1 =~ s/ftp/ftp/i;
$tmp1 =~ s/gopher/gopher/i;
$tmp1 =~ s/file/file/i;
$tmp1 =~ s/news/news/i;
$tmp1 =~ s/mailto/mailto/i;
$tmp1 =~ s/telnet/telnet/i;
$tmp1 =~ s/wais/wais/i;
#
if ($tmp1 !~ /((http)|(ftp)|(gopher)|(file)|(news)|(mailto)|(telnet)|(wais).+)/) {
# make absolute URL using $prev_url
#
undef @dirstack;
foreach $foo (split("/",$prev_url)) {
if ($foo ne "") {
push(@dirstack, $foo);
};
};
if (substr($prev_url,-1) ne "/") {
pop @dirstack;
}
@relative = split('/',$tmp1);
for ($i=$[; $i <= $#relative; $i++) {
next if (@relative[$i] eq "");
if (@relative[$i] =~ /\.\./) {
pop @dirstack;
} else {
push(@dirstack, @relative[$i]);
};
};
# dirstack now contains a stack of path elements
#
$tmp1 = '';
foreach $elem (@dirstack) {
$tmp1 = $tmp1.'/'.$elem;
};
$tmp1 = substr($tmp1,1,(length($tmp1)-1));
};
# $tmp1 now contains an absolute url. Lets do some post-prettifying
#
if ($tmp1 =~ /http/) {
if (substr($tmp1, -1) eq "/") {
$tmp1 = $tmp1.'index.html';
};
if (rindex($tmp1,'/') <= rindex($tmp1,'#')) {
# we don't want no steeking local refs!!
#
$tmp1 = substr($tmp1, $[, (rindex($tmp1,'#')));
};
}
$firstslash = index($tmp1,'/');
if (substr($tmp1, ++$firstslash,1) ne '/') {
$tmp1 = substr($tmp1,$[,$firstslash).'/'.substr($tmp1,$firstslash);
}
# ignore if it's a weird file
next if $tmp1 =~ /\.mp(e)*g/i ;
next if $tmp1 =~ /\.zip/i ;
next if $tmp1 =~ /\.tar/i ;
next if $tmp1 =~ /\.hqx/i ;
next if $tmp1 =~ /\.ps/i ;
next if $tmp1 =~ /\.gz/i ;
next if $tmp1 =~ /\.bin/i ;
# log, but don't push, if it's not readable
if ($tmp1 =~ /wais:/i) {$nopush = "true"};
if ($tmp1 =~ /news:/i) {$nopush = "true"};
if ($tmp1 =~ /telnet:/i) {$nopush = "true"};
if ($tmp1 =~ /.*\?.*=.*/i) {$nopush = "true"};
if ($tmp1 =~ /.*\?.*/i) {$nopush = "true"};
# don't bother retreiving wais, news, telnet, or query spaces
if ($domain ne "") {
if ($tmp1 !~ /$domain/i) { $nopush = "true" };
}
# don't visit URLs that lie outside a -domain argument
if ($print eq "1") {
print("$tmp1\n") ;
} else {
if ($print eq "2") {
if ($nopush ne "true") {
print("$xdepth:$previous --> $tmp1\n") ;
} else {
print("$xdepth:$previous --> $tmp1 (not dereferenced)\n");
}
};
}
if ($print eq "3") {
print "+";
if ($pretty{$previous} eq "") {
$pretty{$previous} = $xdepth;
}
#if (($pretty{$previous} =~ /$tmp1/) == 1 ) {
#print "found $tmp1 in \$pretty\{$previous\}\n";
if ($nopush ne "true") {
$pretty{$previous} .= "+".$tmp1;
} else {
if ($nopush eq "true") {
$pretty{$previous} .= "+"."*".$tmp1;
}
}
#}
} # %pretty is indexed on URLs. Each entry contains a "+"-separated
# string of values, starting with the depth at which the URL was
# first encountered, followed by all its children; children which
# will not be visited are prefixed with an asterisk. Note that a
# given URL should not appear in the list more than once.
if ($nopush eq "false") {
&putnext($tmp1,($xdepth+1));
# $prevurl = $tmp1;
$num_items = @tour_list /2;
}
};
}