#!/bin/perl -w
#######################################################################
#
# This tool reads a file tagged with the tstmt.dtd (used for the
# Quran: http://metalab.unc.edu/xml/examples/religion/quran/quran.xml),
# extracts the title number from the bktlong and bktshort elements and
# creates both a num attribute on the title and a num element inside
# the title element.
#
#   <bktlong>1. The Opening</bktlong>
# becomes
#   <bktlong num="1"><num>1</num>The Opening</bktlong>
#
# Titles that do not start with a number are output unchanged.
#
# Usage: the input is the file named by the first argument, or STDIN
# when no argument is given; the result is printed.
#
#######################################################################

use strict;
use warnings;
use XML::Twig;

my $twig = XML::Twig->new(
    TwigRoots =>                        # build twigs only for titles
        { bktlong  => 1,
          bktshort => 1,
        },
    TwigPrintOutsideRoots => 1,         # print the rest as it is parsed
    TwigHandlers =>                     # set title handlers
        { bktlong  => \&process_title,
          bktshort => \&process_title,
        },
    KeepEncoding => 1,                  # keep original encoding
);

# Test defined/length rather than truth so that a file named "0" is
# still treated as a file name.
if ( defined $ARGV[0] && length $ARGV[0] ) {
    $twig->parsefile( $ARGV[0] );       # process the named file
}
else {
    $twig->parse( \*STDIN );            # no file given: read STDIN
}

exit;

# process_title( $twig, $title )
#
# Handler called for each bktlong / bktshort element.
#
#   $twig  - the XML::Twig object (unused here)
#   $title - the title element being processed
#
# If the first text child of the title starts with a number (optionally
# followed by a dot and whitespace), the number is removed from the
# text, stored in a num attribute on the title, and inserted as a new
# num element pasted as the first child of the title.  The title is then
# printed.  A title with no text child, or whose text does not start
# with digits, is printed unmodified.
#
# Returns the value returned by $title->print.
sub process_title {
    my ( $twig, $title ) = @_;

    # NOTE(review): the original passed the bareword PCDATA here, which
    # only compiles under "use strict" if that constant is visible in
    # package main.  The literal tag name is used instead; confirm that
    # the installed XML::Twig uses '#PCDATA' as the tag of text nodes.
    my $title_pcdata = $title->first_child('#PCDATA');   # first text element

    my ( $title_no, $title_text );
    if ( defined $title_pcdata ) {
        # Separate the number from the rest of the title.  /s and \z
        # (instead of \Z without /s) keep embedded and trailing newlines
        # in the remaining text instead of failing or dropping them.
        ( $title_no, $title_text ) =
            $title_pcdata->pcdata =~ /\A(\d+)\.?\s*(.*)\z/s;
    }

    # Only rewrite the title when a number was actually found; otherwise
    # the attribute, the num element and the title text would all be
    # set from undef, wiping the original title text.
    if ( defined $title_no ) {
        $title->set_att( num => $title_no );                 # create attribute num
        my $num = XML::Twig::Elt->new( 'num', $title_no );   # create element num
        $num->paste( 'first_child', $title );                # paste element num
        $title_pcdata->set_pcdata($title_text);              # set title new text
    }

    return $title->print;                                    # output the title
}