#!/bin/perl -w
#######################################################################
# #
# This tool reads a file tagged with the tstmt.dtd (used for the #
# Quran, http://metalab.unc.edu/xml/examples/religion/quran/quran.xml), #
# extracts the title number from bktlong and bktshort elements and #
# creates both a num attribute for the title and a num element #
# inside the title element. For example, #
# <bktlong>1. The Opening</bktlong> becomes #
# <bktlong num="1"><num>1</num>The Opening</bktlong> #
# #
#######################################################################
use strict;
use XML::Twig;
# Build the twig.  Only the title elements (bktlong and bktshort) are
# turned into twigs and handed to process_title; TwigPrintOutsideRoots
# asks XML::Twig to print the rest of the document.
my $twig = XML::Twig->new(
    TwigRoots => {                      # build twig only for titles
        bktlong  => 1,
        bktshort => 1,
    },
    TwigPrintOutsideRoots => 1,         # print the rest
    TwigHandlers => {                   # set title handlers
        bktlong  => \&process_title,
        bktshort => \&process_title,
    },
    KeepEncoding => 1,                  # keep original encoding
);

# Process the file named by the first argument, or STDIN when no (or an
# empty) argument is given.  The check is on definedness and length rather
# than plain truth so that a file literally named "0" is still parsed.
if ( defined $ARGV[0] && length $ARGV[0] ) {
    $twig->parsefile( $ARGV[0] );
}
else {
    $twig->parse( \*STDIN );
}
exit;
# Title handler, called for each bktlong / bktshort element.
#
# Splits a leading number off the title's first text child
# ("1. The Opening" -> "1" and "The Opening"), records the number both as
# a num attribute on the title and as a num element pasted as the title's
# first child, stores the remaining text back in the text child, and
# prints the title.
#
# A title that has no text child, or whose text does not start with
# digits, is printed unchanged instead of dying or losing its text.
#
#   $twig  - the twig being processed (not used here)
#   $title - the title element to rewrite and print
sub process_title {
    my ( $twig, $title ) = @_;

    # '#PCDATA' is the value of XML::Twig's PCDATA constant, i.e. the tag
    # carried by text elements.
    my $title_pcdata = $title->first_child('#PCDATA');    # get first text element

    if ( defined $title_pcdata ) {
        my $title_text = $title_pcdata->pcdata;           # get its data

        # Separate the number from the rest of the title.  /s lets the
        # rest span embedded newlines; the lazy (.*?) followed by \n?\z
        # still drops a single trailing newline, as (.*)\Z did.
        if ( defined $title_text
            && $title_text =~ /\A(\d+)\.?\s*(.*?)\n?\z/s )
        {
            my ( $title_no, $title_rest ) = ( $1, $2 );

            $title->set_att( num => $title_no );                  # create attribute num
            my $num = XML::Twig::Elt->new( 'num', $title_no );    # create element num
            $num->paste( 'first_child', $title );                 # paste element num
            $title_pcdata->set_pcdata($title_rest);               # set title new text
        }
    }

    $title->print;    # output the title (modified or not)
    return;
}