#!/bin/perl -w
#######################################################################
#
# This tool reads a file tagged with the tstmt.dtd (used for the
# quran, http://metalab.unc.edu/xml/examples/religion/quran/quran.xml),
# extracts the title number from bktlong and bktshort elements and
# creates both a num attribute on the title and a num element
# inside the title element:
#
#   <bktlong>1. The Opening</bktlong>
# becomes
#   <bktlong num="1"><num>1</num>The Opening</bktlong>
#
# Usage: pass the XML file as the first argument, or feed the
# document on STDIN. The result is written to STDOUT.
#
# Note: this version outputs the text in UTF-8.
#
#######################################################################

use strict;
use warnings;
use XML::Parser;

# The parser hands back character strings; encode them as UTF-8 on output
# so that non-ASCII text is not silently downgraded to Latin-1.
binmode STDOUT, ':encoding(UTF-8)';

# True between the start tag of a title element and the point where that
# start tag has been completed (with or without a num attribute).
my $in_title = 0;

# With the Stream style the parser calls StartTag, EndTag and Text in this
# package; events without a handler are printed unchanged by the style.
my $parser = XML::Parser->new( Style => 'Stream' );

if ( defined $ARGV[0] && length $ARGV[0] ) {
    $parser->parsefile( $ARGV[0] );    # parse the named file
}
else {
    $parser->parse( \*STDIN );         # no file given: read STDIN
}

exit;

# Escape markup characters in decoded text so the output stays well-formed.
# $is_attribute additionally escapes double quotes for use inside an
# attribute value.
sub _xml_escape {
    my ( $text, $is_attribute ) = @_;

    $text =~ s/&/&amp;/g;              # must be first
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g if $is_attribute;

    return $text;
}

# Complete a title start tag that is still open and has received no number.
sub _finish_title_tag {
    print '>';
    $in_title = 0;
    return;
}

# Called for every start tag. The Stream style passes the parser and the
# element type; the element's attributes are available in %_.
sub StartTag {
    my ( $p, $gi ) = @_;

    # A title whose first child is an element never reaches Text: close
    # its start tag before emitting the child.
    _finish_title_tag() if $in_title;

    if ( $gi eq 'bktlong' || $gi eq 'bktshort' ) {
        print "<$gi";                  # completed in Text (or EndTag)

        # Keep existing attributes; "num" is owned by this tool, so an old
        # value is dropped rather than duplicated.
        for my $name ( sort keys %_ ) {
            next if $name eq 'num';
            print qq{ $name="}, _xml_escape( $_{$name}, 1 ), q{"};
        }

        $in_title = 1;                 # triggers title processing in Text
    }
    else {
        print $p->recognized_string();    # any other tag: copy as is
    }

    return;
}

# Called for every end tag; $_ holds the end tag text.
sub EndTag {
    my $end_tag = $_;

    # An empty title never reaches Text: close its start tag first.
    _finish_title_tag() if $in_title;

    print $end_tag;

    return;
}

# Called for each run of character data; $_ holds the decoded text.
sub Text {
    my $text = $_;

    if ($in_title) {
        # Separate the leading number from the rest of the title. /s lets
        # the title span several lines, \z keeps a trailing newline.
        if ( $text =~ /\A(\d+)\.?\s*(.*)\z/s ) {
            my ( $title_no, $title_text ) = ( $1, $2 );

            print qq{ num="$title_no">};    # finish the title start tag
            print "<num>$title_no</num>",   # the num element ...
                _xml_escape($title_text);   # ... followed by the title
        }
        else {
            # No leading number: keep the title text untouched.
            print '>', _xml_escape($text);
        }

        $in_title = 0;                 # not in a title start tag anymore
    }
    else {
        print _xml_escape($text);      # ordinary text: copy it
    }

    return;
}