#!/usr/bin/perl
use LWP::Simple;
use HTML::Parse;
use HTML::Entities;
use Text::Wrap;
use Getopt::Long;

use strict;
require 5.0;

=pod

=head1 NAME

hp -- HTML prettyprinter

Online documentation will go here

=cut

# initialize global variables

%MAIN::breaktag = ( # list of tags to automatically insert line breaks around 
              "p"      => 1,
              "meta"   => 1,
              "title"  => 1,
              "head"   => 1,
              "body"   => 1,
              "h1"     => 1,
              "h2"     => 1,
              "h3"     => 1,
              "h4"     => 1,
              "h5"     => 1,
              "h6"     => 1,
              "pre"    => 1,
              "table"  => 1,
              "tr"     => 1,
              "td"     => 1,
              "center" => 1,
              "br"     => 1,
              "dl"     => 1,
              "dt"     => 1,
              "dd"     => 1,
              "li"     => 1,
              "al"     => 1,
              "set-var"   => 1,
              "get-var"   => 1,
              "include"   => 1,
            );

%MAIN::junkclosure = ( # list of tags for which we want to drop the close
                       # tag; flagged as IMPLICIT inthe DTD
              "p"      => 1,
              "hr"     => 1,
              "img"    => 1,
              "meta"   => 1,
              "set-var"   => 1,
              "get-var"   => 1,
              "include"   => 1,
           );

$MAIN::tmpbuff = ""; # global scratch workspace for accumulating text stream

# initialize variables local to main loop 

my ($tabspace) = 3;  # default num. of columns per tabstop
my ($wrapcols) = 70; # default page width for line wrap
my ($implicit) = 1;  # flag: insert implicit tags, or not
my ($ignore)   = 0;  # flag: ignore unknown tags, or not
my ($help)     = 0;  # do we want help?
my ($target)   = ""; # name of current file to process

# set up command line options

GetOptions("w=i"       => \$wrapcols,
           "wrap=i"    => \$wrapcols,
           "t=i"       => \$tabspace,
           "tab=i"     => \$tabspace,
           "implicit!" => \$implicit,
           "ignore!"   => \$ignore,
           "h"         => \$help,
           "help"      => \$help);

# issue help message, if needed

if (($help > 0)|| ($ARGV[0] eq"")) {
   print "\nhppp -- html prettyprinter\n",
         "reformat an HTML document to be more easily maintainable.\n",
         "\nOptions:\n",
         "-w<n>, --wrap=<n>     wrap document to <n> columns\n",
         "                      default: [$wrapcols]\n",
         "-t<n>, --tab=<n>      indent <n> characters per tag level\n",
         "                      default: [$tabspace]\n",
         "--implicit, --noimplicit\n",
         "                      specify insertion of implicit close tags\n",
         "                      (where missing) \n",
         "                      default: [on]\n",
         "--ignore, --noignore\n",
         "                      ignore or don't ignore unknown tags\n",
         "                      default: [on]\n",
         "\nTypical usage:\n",
         "hp -w 80 -t 2 index.html.old >index.html\n\n";
   exit 0;  
}

# foreach specified file, prettyprint it!

foreach $target (@ARGV) {
    $HTML::Parse::IGNORE_UNKNOWN = $ignore;
    my ($p) = HTML::Parser->new;
    $p->netscape_buggy_comment(1);
    $HTML::Parse::IMPLICIT_TAGS = $implicit;
    $p = parse_htmlfile($target);
    print "<html>";
    treewalk($p, 0, $tabspace, $wrapcols);
    print "\n</html>\n";
}

exit 0;

#------------------ support subroutine --------------------------
sub treewalk {
    my ($p) = shift @_;
    my ($depth) = shift @_;
    my ($tabspace) = shift @_;
    my ($wrap) = shift @_;
    my ($entity) = "";
    my ($indent) = 0;
    my ($bdindent) = 0;
    my ($outname) = "";
    my ($out) = "";
    if (ref($p->content)) {
        foreach $entity (@{ $p->content } ) {
            if (ref($entity) eq "HTML::Element") {
                $indent =  " " x ($depth * $tabspace);
                $bdindent = $indent . (" " x $tabspace);
                $out  = lc($entity->starttag);
                $out  =~    /^\S*?<       # start of line to start of tag
                              ([-\w]+)    # word contents of tag
                              \W+         # something that ain't a word
                            /x;
                $outname = $1;
                if (defined $MAIN::breaktag{$outname} ) {
                    if ($MAIN::tmpbuff !~ /^[\s\n]+$/)  {
                        $Text::Wrap::columns = ($wrap - $depth);
                        print "\n", 
                              (wrap($bdindent, 
                                    "$bdindent ", 
                                    ucfirst($MAIN::tmpbuff)));
                    }
                    print wrap("\n$indent", 
                               $indent, HTML::Entities::decode($out));
                    $MAIN::tmpbuff = "";
                } else {
                    $MAIN::tmpbuff .= HTML::Entities::decode($out);
                }
                # let's dive down the tree, now ...
                treewalk($entity, $depth+1, $tabspace, $wrap);
                # and it's time to close out whatever tag we were in before 
                if ($MAIN::breaktag{$outname} == 1) {
                    if (($MAIN::tmpbuff !~ /^[\s\n]+$/) && 
                        (length($MAIN::tmpbuff) > 0)) {
                        $Text::Wrap::columns = ($wrap - $depth);
                        print "\n", 
                              (wrap($bdindent, 
                                    "$bdindent ", 
                                    ucfirst($MAIN::tmpbuff)));
                    }
                    $MAIN::tmpbuff = "";
                    if ( ! defined ($MAIN::junkclosure{$outname})) {
                        print ("\n", $indent, lc($entity->endtag));
                    }
                } else {
                    if ($MAIN::tmpbuff !~ />$/) {
                        $MAIN::tmpbuff = substr($MAIN::tmpbuff, 0, -1);
                    }
                    if ( ! defined ($MAIN::junkclosure{$outname})) {
                        $MAIN::tmpbuff .= lc($entity->endtag) ;
                    }
                }
            } elsif (! ref($entity)) {
                if ($entity ne "") {
                    $MAIN::tmpbuff .=  $entity; 
                    $MAIN::tmpbuff .= " ";
                }
            }
        }
    }
}

#------------------------- end treewalk --------------------------
