#!/bin/perl -w
#######################################################################
# #
# This tool reads a file tagged with the tstmt.dtd (used for the #
# quran, http://metalab.unc.edu/xml/examples/religion/quran/quran.xml), #
# extracts the title number from bktlong and bktshort elements and #
# creates both a num attribute for the title and a num element #
# inside the title element. #
# <bktlong>1. The Opening</bktlong> becomes #
# <bktlong num="1"><num>1</num>The Opening</bktlong> #
# #
# Note: this version outputs the text in UTF-8 #
# #
#######################################################################
use strict;
use XML::Parser;
# Flag shared by the handlers below: StartTag raises it when it has opened
# (but not yet closed) a bktlong/bktshort start tag, and Text lowers it once
# it has completed that tag.
my $in_title = 0;

# The parser hands character strings to the handlers; give STDOUT an explicit
# encoding layer so that what we print really is UTF-8, as promised in the
# header, and so that -w does not complain about wide characters.
binmode STDOUT, ':encoding(UTF-8)';

# Stream style calls StartTag / Text (defined below) in this package and
# prints anything for which no handler is defined.
my $parser = XML::Parser->new( Style => 'Stream' );

# Parse the file named on the command line, or STDIN when none is given.
# Test for definedness/length rather than truth so a file called "0" works.
if ( defined $ARGV[0] && length $ARGV[0] ) {
    $parser->parsefile( $ARGV[0] );
}
else {
    $parser->parse( \*STDIN );
}
exit;
# StartTag -- Stream-style handler, called for all start tags.
#   $p   : the parser object
#   $gi  : the name of the element being opened
#   %att : the attributes of the element (not used)
# For bktlong and bktshort only the beginning of the tag ("<name ") is
# printed and $in_title is raised; Text then adds the num attribute and
# closes the tag. Any other start tag is printed as it was recognized.
# Note that attributes already present on a title element are not copied
# to the output.
# NOTE(review): the title tag is only ever closed from Text, so a title with
# no text directly after its start tag would presumably be left unclosed --
# confirm against the input documents.
sub StartTag {
    my ( $p, $gi, %att ) = @_;

    if ( $gi eq 'bktlong' || $gi eq 'bktshort' ) {
        print "<$gi ";       # deliberately left open: closed in Text
        $in_title = 1;       # triggers the title processing in Text
    }
    else {
        print $p->recognized_string();    # any other tag: just print it
    }
}
# Text -- Stream-style handler, called for each string of character data,
# which the parser supplies in $_.
# Outside a title the text is printed unchanged apart from XML escaping.
# Inside a title (StartTag has printed "<bktlong " or "<bktshort " and set
# $in_title) the leading number is split off and used twice: as the value of
# the num attribute that closes the pending start tag, and as the content of
# a num element placed in front of the remaining title text.
# A title that does not start with a number just gets its start tag closed
# and its text printed, so nothing is lost.
sub Text {
    my $text = $_;    # work on a copy rather than on the global $_

    # Character data reaches us with entities already expanded; escape the
    # markup characters again so the output stays well-formed.
    for ($text) {
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
    }

    if ( !$in_title ) {
        print $text;    # ordinary text: just print
        return;
    }

    $in_title = 0;      # not in title anymore

    # Separate the number (and an optional dot plus whitespace) from the rest
    # of the text; /s and \z let the title contain or end with a newline.
    if ( my ( $title_no, $title_text ) = $text =~ /\A(\d+)\.?\s*(.*)\z/s ) {
        print "num=\"$title_no\">";                  # close the title tag
        print "<num>$title_no</num>$title_text";     # the num element + text
    }
    else {
        print ">$text";    # no number found: close the tag, keep the text
    }
    return;
}