6.  Beautifiers For SGML and XML

To beautify SGML and XML, use one of the Perl scripts below. I use the Perl script from Kevin and it works fine for me.

6.1.  SGML Auto-Indenter By Kevin

This script was originally written by Kevin M. Dunn (kdunn@hsc.edu), Department of Chemistry, Hampden-Sydney College, HSC, VA 23943, (804) 223-6181, (804) 223-6374 (Fax). The version shown here was modified and enhanced by Al Dev (alavoor[AT]yahoo.com).

Several people have discussed using Tidy to indent SGML and XML sources, but it does not work for SGML documents, because Tidy does not recognize the entities. Rather than fix Tidy, here is a Perl script that indents anything with SGML-type tags. Only non-empty tags are indented, and text is justified at 80 characters per line (easily changed).

Known problems: the script will break line-specific environments. So far, the script is quite general -- it does not recognize specific tags, and so could be used for any XML or SGML, not just DocBook. Is there any way to recognize literal text independent of the DTD? Leading whitespace, for example? Trailing whitespace? Alternatively, I could indent tags only, and leave all non-tag text unjustified and unindented.

#!/usr/bin/perl -w
#
# sb: the sgml beautifier
# indents non-empty sgml tags
# usage: sb filename or sb < filename or | sb
# author: Kevin M. Dunn (kdunn@hsc.edu), Modified by Al Dev (alavoor[AT]yahoo.com)
# license: anyone is free to use this for any purpose whatever
#
use strict;
use diagnostics;
	  
# separate_tags($tmpfile)
#
# Reads every input line from <> (the files named in @ARGV, or STDIN when
# there are none) and writes it to $tmpfile with a newline inserted before
# each "<" and after each ">", so that every tag ends up on a line of its own.
#
# Dies when called without an argument, or when $tmpfile cannot be written.
sub separate_tags 
{
	die "\nInsufficient args .. " if @_ < 1;
	my ($tmpfile) = @_;

	# Three-argument open with a lexical handle: $tmpfile is always taken as
	# a plain file name, and a failure is reported here instead of silently
	# throwing away everything that is printed below.
	open(my $out, '>', $tmpfile) or die "\nCannot write '$tmpfile': $!\n";
	while (my $current_line = <>)
	{
		if ($current_line eq "\n")
		{
			# Pad a completely empty line with whitespace so that it has a
			# non-zero length and is not bypassed in indent_tags()
			$current_line = "\t  " . $current_line;  # Prepend with spaces
		}
		$current_line =~ s/</\n</g;  # Put newline before start of tag "<"
		$current_line =~ s/>/>\n/g;  # Put newline after end of tag ">"
		print {$out} $current_line;
	}
	# Buffered write errors (e.g. a full disk) only show up at close time.
	close($out) or die "\nCannot finish writing '$tmpfile': $!\n";
}
	  
# get_tags($tmpfile)
#
# Scans $tmpfile (one tag or text run per line) and records every closing tag
# it finds: the closing form (e.g. '</TITLE') is stored as a key of
# %sgb::tag2 and the corresponding opening form (e.g. '<TITLE') as a key of
# %sgb::tag1. Tags that never appear in closing form are not recorded.
#
# Dies when called without an argument, or when $tmpfile cannot be read.
sub get_tags 
{
	die "\nInsufficient args .. " if @_ < 1;
	my ($tmpfile) = @_;

	# Three-argument open: the name can never be mistaken for a mode or a
	# pipe, and an unreadable file is reported instead of being skipped.
	open(my $in, '<', $tmpfile) or die "\nCannot read '$tmpfile': $!\n";
	while (my $word = <$in>)
	{
		$word =~ s/[> ].*//;  # keep only the tag name: cut at the first ">" or space
		chomp($word);
		if ( $word =~ /^<\/.*/ )
		{
			$sgb::tag2{$word} = 1; # here the word has something like '</TITLE'
			$word =~ s/\///;  # drop the first "/" to get the opening form
			$sgb::tag1{$word} = 1; # here the word has something like '<TITLE'
		}
	}
	close($in);
}
	  
# indent_tags($tmpfile)
#
# Reads $tmpfile (one tag or text run per line) and prints the indented
# document to the currently selected output handle:
#   * a tag listed in %sgb::tag1 (an opening tag that has a closing tag)
#     is printed on a new line and increases the nesting depth;
#   * a tag listed in %sgb::tag2 (a closing tag) decreases the depth and is
#     printed on a new line, except </programlisting>, which is appended
#     without a line break or indentation;
#   * any other tag is printed on its own line at the current depth;
#   * text is passed to justify(), except directly after <programlisting>,
#     where it is printed unchanged.
#
# Dies when called without an argument, or when $tmpfile cannot be read.
sub indent_tags 
{
	die "\nInsufficient args .. " if @_ < 1;
	my ($tmpfile) = @_;
	my $jl = 80; #text will be justified to 80 characters/line
	my $nl = 0;  # handed to justify(); set to $jl to force a line break first
	my $sp = 0;  # current nesting depth, used as index into @space
	my @space = ("");  # $space[$depth] is the leading whitespace for that depth
	  
	my $newline = ""; # hack to prevent extraneous blank first line
	my $saveword = ""; # most recent tag name; starts empty so lc() below never sees undef
	  
	# Three-argument open with a lexical handle; report failure instead of
	# silently printing nothing.
	open(my $in, '<', $tmpfile) or die "\nCannot read '$tmpfile': $!\n";
	# Keep reading into $_ : this loop has always exposed the current line
	# there to the code it calls.
	while (<$in>)
	{
		chomp($_); # avoid \n on last field
		my $current_line = $_;
		my $word = $current_line;
		$word =~ s/[> ].*//;  # keep only the tag name: cut at the first ">" or space
		if ( $sgb::tag1{$word} )
		{
			$saveword = $word;
			print "\n$space[$sp]$current_line";
			$nl = $jl; # force new line on next line of input
			$sp++;
			if ( ! $space[$sp] )
			{
				# first time this depth is reached: two more spaces than the parent
				$space[$sp] = $space[$sp-1] . "  ";
			}
		}
		elsif ( $sgb::tag2{$word} )
		{
			$saveword = $word;
			# Never drop below depth 0: a closing tag without a matching
			# opening tag would otherwise make $sp negative, and a negative
			# index addresses @space from its end.
			$sp-- if $sp > 0;
			# If the tag is <ProgramListing> then do not justify...
			if (lc($word) eq "</programlisting")
			{
				print "$current_line";
			}
			else
			{
				print "\n$space[$sp]$current_line";
			}
			$nl = $jl; # force new line on next line of input
		}
		elsif ( $word =~ /<.*/ ) 
		{
			# a tag that never appears in closing form: no change of depth
			$saveword = $word;
			print "$newline$space[$sp]$current_line";
			$newline = "\n"; # hack to prevent extraneous blank first line
			$nl = $jl; # force new line on next line of input
		}
		elsif ( length($current_line) > 0 ) 
		{
			# If the tag is <ProgramListing> then do not justify...
			if (lc($saveword) eq "<programlisting")
			{
				# DO NOT put any tabs or spaces, because repeated running of this program
				# on same file will keep putting tabs or spaces.
				print "$newline$current_line";  
				$newline = "\n"; # hack to prevent extraneous blank first line
				$nl = $jl; # force new line on next line of input
			}
			else
			{
				$nl = justify($jl, $nl, $sp, $current_line, @space);
			}
		}
	}
	# Release the handle before returning so the caller can remove the file.
	close($in);
}
	  
# justify($jl, $nl, $sp, $current_line, @space)
#
# Prints the whitespace-separated words of $current_line, each followed by
# one space, starting a new line (indented with $space[$sp]) whenever adding
# the next word would reach $jl characters.
#
#   $jl            maximum line length
#   $nl            extra length charged to the first word; the caller passes
#                  $jl here to force a line break before that word
#   $sp, @space    current depth and the indentation string for each depth
#
# Reads and updates the global $sgb::ll, the length of the output line
# currently being filled. Returns 0 once at least one word has been printed,
# otherwise $nl unchanged.
sub justify 
{
	die "\nInsufficient args .. " if @_ < 4;
	my ($jl, $nl, $sp, $current_line, @space) = @_;
	  
	# Split the line that was actually passed in, rather than whatever the
	# global $_ happens to contain at the time of the call.
	foreach my $word (split ' ', $current_line)
	{
		$sgb::ll += length($word) + 1 + $nl; # line length if this word is added
		if ($sgb::ll < $jl) # if short enough, print it
		{ 
			print "$word ";
		}
		else # if line is too long, start a new one
		{ 
			print "\n$space[$sp]$word ";
			$sgb::ll = length($space[$sp] . $word) + 1;
		}
		$nl = 0; # the forced break, if any, has been consumed
	}
	return $nl;
}
	  
use File::Temp ();  # core module, needed only for the scratch file below

$sgb::ll = 0; # global var: length of the output line justify() is currently filling

# Scratch file that holds the input with every tag on a line of its own.
# File::Temp picks an unused name in the system temporary directory (rather
# than a guessable "<pid>.tmp" in the current directory) and, because of
# UNLINK, also removes the file at exit when one of the passes below dies.
my ($tmpfh, $tmpfile) = File::Temp::tempfile("sbXXXXXX", TMPDIR => 1, UNLINK => 1);
close($tmpfh) or die "\nCannot close temporary file '$tmpfile': $!\n";
separate_tags($tmpfile);  # pass 1: write the input to the scratch file, one tag per line
get_tags($tmpfile);       # pass 2: find out which tags have a closing form
indent_tags($tmpfile);    # pass 3: print the indented document
unlink ("$tmpfile"); # remove temporary file
print "\n"; # add final line to output
	  
        

6.2.  SGML Auto-Indenter By Hector

Download from "http://www.olea.org/tmp/indent-sgml-xml". The author is Hector (hector@debian.org). The version shown here was modified and enhanced by Al Dev (alavoor[AT]yahoo.com).

The program below uses XML::Parser::Expat. Read the online manual pages with 'man XML::Parser::Expat' and 'man XML::Parser'.

#!/usr/bin/perl -w 
#
	  
# Author: Hector (hector@debian.org). Modified by Al Dev (alavoor[AT]yahoo.com)
	  
# For documentation please see 'man XML::Parser::Expat' and 
# also see 'man XML::Parser'
	  
use diagnostics;
use XML::Parser::Expat;
	  
$|=1; # flush STDOUT after every write

# A file name is mandatory. Test the argument count rather than the truth of
# the argument, so that a file literally named "0" is accepted.
if ( !@ARGV ) 
{
       print "Argument missing\n";
       exit 1;
}
	  
# Element names, written as regex alternations, that the handlers below
# treat specially: $inline_tags stay inside the running text, $one_line
# elements are written with their content and end tag on a single line.
$inline_tags = "acronym|ulink|link|citetitle|firstname|surname|application|guimenu|guisubmenu|guimenuitem|menuchoice|interface|guilabel|guibutton|glossterm|systemitem|filename|xref|emphasis|keycap|markup|email|command|inlinegraphic|entry|email|screeninfo|graphic";
$one_line = "title|member";
	  
$todo = "";        # the finished, indented output
$temp = "";        # text and inline tags collected since the last block tag
$ancho = "  ";     # whitespace added per indentation level
$indentacion = 0;  # current indentation level
	  
#open IN , "<$ARGV[0]";
#my $todo = join ('', <IN>);
#close (IN);
	  
$parser = XML::Parser::Expat->new;
$parser->setHandlers('Start'   => \&inicio,
                     'End'     => \&fin,
                     'Char'    => \&cadena,
                     'Comment' => \&comentario);
	  
# Three-argument open: the argument is always taken as a plain file name, and
# the reason for a failure is included in the message.
open(my $entrada, '<', $ARGV[0]) or die "Couldn't open '$ARGV[0]': $!\n";
	  
# If you get this type of error:
# syntax error at line 1, column 0, byte 0 at ../sgml-beautifier-indentar.pl line 37
# then edit the input file $ARGV[0] and put this line at the top -
# 	<?xml version="1.0" encoding="utf-8"?>
$parser->parse($entrada);
close($entrada);
	  
	  
# Squeeze out empty and whitespace-only lines before printing.
$todo =~ s/\n+/\n/gm;
$todo =~ s/\n *\n/\n/gm;
print "$todo\n";
	  
exit 0; # success
	  
# inicio($p, $el, %atts) -- start-tag handler.
#
# Rebuilds the start tag of element $el with its attributes in sorted order.
# For an element named in $inline_tags or $one_line the tag is appended to
# the pending text in $temp. For every other element the pending text is
# flushed to $todo through indentar(), the tag is written on a line of its
# own at the current level, and $indentacion is increased.
sub inicio
{
	my ($p, $el, %atts) = @_;
	my $tag = "<$el";
	# Walk over the attribute NAMES only; sorting the whole hash would also
	# feed the attribute values through the loop as if they were names.
	foreach my $key ( sort keys %atts )
	{
		my $valor = $atts{$key};
		# Keep every attribute that has a value, including "0" and "".
		next unless defined $valor;
		# The parser is expected to deliver attribute values with entity
		# references already expanded, so markup characters are escaped
		# again to keep the tag well-formed.
		$valor =~ s/&/&amp;/g;
		$valor =~ s/</&lt;/g;
		$valor =~ s/"/&quot;/g;
		$tag .= " $key=\"$valor\"";
	}
	$tag .= ">";
	  
	# Compare the WHOLE element name: without the anchors any name that
	# merely contains a listed name (e.g. "glossentry" contains "entry")
	# would be treated as inline.
	if ( $el !~ /\A(?:$inline_tags|$one_line)\z/ )
	{
		# & makes the call ignore any prototype declared on indentar
		$temp = &indentar ($temp, $indentacion);
		if ( $temp ) {
			$todo .= "$temp\n";
		}
		my $pad = $ancho x $indentacion;
		$todo .= "$pad$tag\n";
		$temp = "";
		$indentacion++;
	}
	else
	{
		$temp .= $tag;
	}
}
	  
# fin($p, $el) -- end-tag handler.
#
# For an element named in $inline_tags the end tag is appended to the pending
# text in $temp. Otherwise the pending text is flushed to $todo through
# indentar() and the end tag is written after it: directly behind the text
# for a $one_line element, or on a line of its own, one level further out,
# for any other element. The element name is tested with the same
# whole-name match as in inicio(), so that every increase of $indentacion
# there is undone here.
sub fin
{
	my ($p, $el) = @_;
	my $tag = "</$el>";
	if ( $el !~ /\A(?:$inline_tags)\z/ )
	{
		# & makes the call ignore any prototype declared on indentar
		$temp = &indentar ($temp, $indentacion);
		$temp =~ s/\n$// ;
		$todo .= "$temp";
		if ( $el !~ /\A(?:$one_line)\z/ )
		{
			$indentacion--;
			if ( !($todo =~ /\n$/) ) {
				$todo .= "\n";
			}
			my $pad = $ancho x $indentacion;
			$todo .= "$pad";
		}
		$todo .= "$tag\n";
		$temp = "";
	}
	else
	{
		$temp .= "$tag";
	}
}
	  
# cadena($p, $str) -- character-data handler.
#
# Appends $str to the pending text in $temp after escaping markup characters
# and collapsing every run of spaces to a single space.
sub cadena
{
	my ($p, $str) = @_;
	# The parser is expected to deliver character data with entity
	# references such as &lt; and &amp; already expanded; escape them again
	# so that the text written out is still well-formed markup.
	$str =~ s/&/&amp;/g;  # must come first, the next two introduce "&"
	$str =~ s/</&lt;/g;
	$str =~ s/>/&gt;/g;
	$str =~ s/ +/ /g;
	$temp .= "$str";
}
	  
# comentario($p, $str) -- comment handler.
#
# Appends the comment text $str to $todo as a comment of its own, with the
# delimiters on separate lines.
sub comentario
{
	my ($p, $str) = @_;
	# Strip surrounding whitespace first: the template below adds its own,
	# so without this every further run over the same file would wrap the
	# comment text in one more layer of spaces and newlines.
	$str =~ s/\A\s+//;
	$str =~ s/\s+\z//;
	$todo .= "<!--\n $str \n-->\n";
}
		
																	    
# indentar($linea, $indentacion)
#
# Wraps the text $linea with cortar_linea() to a width of 75 columns minus
# the indentation, prefixes every resulting line with $indentacion copies of
# $ancho, and returns the result terminated by a newline.
#
# No prototype is declared: the sub takes two arguments, and an empty
# prototype "()" would turn any call written without a leading & into a
# compile-time error.
sub indentar
{
	my $linea = $_[0] ;
	my $indentacion = $_[1];
	my $cantidad = 75 - ( $indentacion * length($ancho));  # columns left for text
	my $pad = $ancho x $indentacion;
	
	# & makes the call ignore any prototype declared on cortar_linea
	my $temp = &cortar_linea ( $linea, $cantidad);
	$temp =~ s/\n/\n$pad/g;  # indent every continuation line
	$temp =~ s/^ //;
	my $resultado = "$pad$temp\n";
	return $resultado;
}
	  
# cortar_linea($linea, $cantidad)
#
# Breaks the text $linea into lines: space-separated words are appended to
# the current line while the line length before the word plus the word
# length stays within $cantidad; otherwise a new line is started. A word
# that is itself longer than $cantidad gets a line of its own. Lines are
# joined with "\n"; every line except the last keeps its trailing space.
# Only the space character separates words, so a newline inside $linea stays
# where it is, as part of a word.
#
# Returns the wrapped text without a trailing newline or trailing space.
# No prototype is declared because the sub takes two arguments, and it works
# on lexical variables only, so it does not disturb the caller's globals.
sub cortar_linea
{
	my ($linea, $cantidad) = @_;
	$linea .= " ";    # make sure the last word is followed by a space, too
	my $actual = "";  # the line currently being filled
	my $hecho = "";   # the lines already completed
	# Take off one word at a time: at least one character, up to the next
	# space. /s lets "." match a newline, so a word followed by a line break
	# is kept instead of being skipped.
	while ( $linea =~ s/\A(.+?) //s )
	{
		my $palabra = $1;
		if ( (length ($actual) + length ($palabra)) <= $cantidad )
		{
			$actual .= "$palabra ";
		}
		else
		{
			# The word does not fit: close the current line (unless nothing
			# has been collected yet) and start the next one with this word.
			# An over-long word is handled the same way; the word after it
			# can never fit behind it and therefore starts a new line as well.
			$hecho .= "$actual\n" if length ($actual);
			$actual = "$palabra ";
		}
	}
	$hecho .= "$actual\n";
	$hecho =~ s/\n$//;
	$hecho =~ s/ $//;
	return $hecho;
}