#!/usr/bin/perl -w # $Id: tr.pl,v 1.5 2009/11/07 16:51:06 asdrury Exp $ # This script uses $WORDS_HOME and $DOMX_HOME # See: http://www.xml.com/lpt/a/873 # HOWTO: # $ cat gmr.xml | $0 > domx.xml use strict; use XML::LibXML; use IO::Handle; use IO::File; use Sys::Hostname; use Getopt::Std; binmode STDOUT, ":utf8"; binmode STDERR, ":utf8"; ############################################################################### # defaults ############################################################################### defined $ENV{'DOMX_HOME'} or die "\$DOMX_HOME not defined. See setenv.sh" ; my $DEFAULT_STOPWORDS_FILE_ = "$ENV{'DOMX_HOME'}/xml/frequent_words.xml" ; my $DEFAULT_SOURCE_FILE_ = "$ENV{'DOMX_HOME'}/xml/gmr.xml" ; my $DEFAULT_FORMAT_ = "plain_text"; my $DEFAULT_TITLE_ = "Dictionary and Grammar of Ordinary of Mass, Extraordinary Form" ; my $SPC1 = " "; my $SPC2 = " "; my $SPC3 = " "; ############################################################################### # usage ############################################################################### my $USAGE_=< domx.xml F: Output format: plain_text, HTML, XML; default = ${DEFAULT_FORMAT_} t: Document title; default = ${DEFAULT_TITLE_} f: location of frequent words file; default = ${DEFAULT_STOPWORDS_FILE_} v: verbose h: this message EOM ############################################################################### # getopts ############################################################################### my %opts ; getopts('F:t:f:vh', \%opts) ; print $USAGE_ and exit 0 if (defined $opts{'h'}); my $format_ = ((defined $opts{'F'}) ? $opts{'F'} : $DEFAULT_FORMAT_) ; my $title_ = ((defined $opts{'t'}) ? $opts{'t'} : $DEFAULT_TITLE_) ; my $freqs_file_ = ((defined $opts{'f'}) ? $opts{'f'} : $DEFAULT_STOPWORDS_FILE_) ; my $verbose_ = ((defined $opts{'v'}) ? 1 : 0) ; ############################################################################### # data structures ############################################################################### my %entries = (); # key=Term+Grammar; val=definition+grammar+@cites my %citations = (); # key=Term+Grammar; @vals=name+id+la+(en) my %translations = (); # key=Term+Grammar; @vals=translation ############################################################################### # vars ############################################################################### my $term_counter = 0; ############################################################################### # initialize the XML parser ############################################################################### my $parser = new XML::LibXML; ############################################################################### # These words appear so often that we don't want to list citations for them ############################################################################### my @frequent_words_ = &read_stop_words() ; my %is_too_frequent_to_cite_ = () ; for (@frequent_words_) { $is_too_frequent_to_cite_{$_} = 1 ; } ############################################################################### # Timestamp and hostname for ############################################################################### my ($date, $time) = &time_stamp() ; my $login = getlogin || getpwuid($<) || "Kilroy"; my $host = hostname(); my $who = "$login\@$host" ; ############################################################################### # Open filehandle, parse and transform GMR XML file ############################################################################### print STDERR "Parsing Ordo Missae XML data from STDIN...\n" if $verbose_ ; my $fh = new IO::Handle; if( $fh->fdopen( fileno( STDIN ), "r" )) { my $doc = $parser->parse_fh( $fh ); my $root = $doc->getDocumentElement; my @concisions = $root->getElementsByTagName('concisio'); print STDERR "\@concisions=" . @concisions . "\n" if $verbose_ ; foreach my $concisio (@concisions) { if ("prayer" eq $concisio->getAttribute("type")) { my $id = $concisio->getAttribute("id"); #print "$id " if $verbose_ ; my $name = $concisio->getAttribute("name"); #print "$name\n" if $verbose_ ; my @sententiae =$concisio->getElementsByTagName('sententia'); #print "\@sententiae=" . @sententiae . "\n" if $verbose_ ; my $latin = ""; my $english = ""; foreach my $sententia (@sententiae) { my $lang = $sententia->getAttribute("lang"); if ("la" eq "$lang") { $latin = $sententia->findvalue("text()") ; $latin =~ s/\n//g ; $latin =~ s/\s+/ /g ; $latin =~ s/^\s*// ; $latin =~ s/\s*$// ; $latin =~ s/,$// ; $latin =~ s/:$// ; $latin =~ s/\.$// ; $latin =~ s/\;$// ; #print STDERR "\$latin=\"$latin\"\n" if $verbose_ ; } elsif ("en" eq "$lang") { $english = $sententia->findvalue("text()") ; $english =~ s/\n//g ; $english =~ s/\s+/ /g ; $english =~ s/^\s*// ; $english =~ s/\s*$// ; $english =~ s/,$// ; $english =~ s/:$// ; $english =~ s/\.$// ; $english =~ s/\;$// ; #print STDERR "\$english=\"$english\"\n" if $verbose_ ; } elsif ("he" eq "$lang" or "el" eq "$lang") { # do nothing } else { print STDERR "unexpected lang=" . $sententia->getAttribute("lang") . "\n" ; } } my @verba = $concisio->getElementsByTagName('verbum'); #print "\@verba=" . @verba . "\n" if $verbose_ ; foreach my $verbum (@verba) { if ("la" eq $verbum->getAttribute('lang')) { ############################################################### # make $key and $id ############################################################### my $radix = $verbum->getAttribute('radix'); $radix =~ s/\.//g ; my $is_proper_name = $verbum->getAttribute('proper_name'); my $term = $verbum->findvalue('term/text()'); $term = lc $term if not $is_proper_name ; #print "" . $term . " " if $verbose_ ; if (not $is_too_frequent_to_cite_{$term}) { my $trns = $verbum->findvalue('translation/text()'); $trns = lc $trns if not $is_proper_name ; #print "" . $trans . " " if $verbose_ ; $trns =~ s/^i$/I/g ; # some ad hoc fixes $trns =~ s/i will/I will/g ; $trns =~ s/i acknowledge/I acknowledge/g ; $trns =~ s/ god/ God/g ; $trns =~ s/^o /O /g ; $trns =~ s/\s+/ /g ; $trns =~ s/^\s*// ; $trns =~ s/\s*$// ; my %grams = &grams_($verbum ) ; #print $grams . "\n" if $verbose_ ; my $key = lc "${radix}:" . $grams{'string'} ; my $cid = "${id}#${name}" ; ############################################################### # pull together all citations for $key ############################################################### my %cites ; %cites = @{ $citations{$key} } if (exists($citations{$key})) ; $cites{$cid} = "${latin} (${english})" ; #print "\$cites{$cid}=$cites{$cid}" . "\n" if $verbose_ ; #&dump_(%cites) if $verbose_ ; $citations{$key} = [ %cites ] ; ############################################################### # get all translations for $key ############################################################### my %trans ; %trans = @{ $translations{$key} } if (exists($translations{$key})) ; $trans{$trns} = "$trns" ; $translations{$key} = [ %trans ] ; ############################################################### # pull together all elements of a dictionary entry ############################################################### my %entry ; %entry = @{ $entries{$key} } if (exists($entries{$key})) ; $entry{'radix'} = $radix if (!exists($entry{'radix'})) ; $entry{'term'} = $term if (!exists($entry{'term'})) ; $entry{'grammar'} = [ %grams ] if (!exists($entry{'grammar'})) ; $entry{'translations'} = [ %trans ] ; $entry{'citations'} = [ %cites ] ; ############################################################### # store dictionary entry ############################################################### $entries{$key} = [ %entry ] ; } # $is_stop_word } # @lang=la } # $verbum } # @type=prayer } # $concisio } # while ############################################################################### ############################################################################### # dump transformed data ############################################################################### if ("plain_text" eq "$format_") { foreach my $key (sort keys %entries) { my %entry = @{ $entries{$key} }; my %grams = @{ $entry{'grammar'} }; print $entry{'term'} . " " . $grams{'string'} . "\n"; my %trans = @{ $entry{'translations'} }; foreach my $k (sort keys %trans) { print $trans{$k} . "; " ; } print "\n" ; my %cites = @{ $entry{'citations'} }; foreach my $cid (sort keys %cites) { #print STDERR "\$cites{$cid}=$cites{$cid}" . "\n" if $verbose_ ; my $text = $cites{$cid} ; my @citation = split /#/, $cid ; # id#prayer print $citation[1] . "," . $citation[0] . ": " . $text . "\n" ; } print "---------------\n" ; } } elsif ("XML" eq "$format_") { print "\n" ; print "\n"; print "\n" ; print "<![CDATA[$title_]]>\n" ; foreach my $key (sort keys %entries) { my %entry = @{ $entries{$key} }; my %grams = @{ $entry{'grammar'} }; print "\n" ; print "\n"; print $SPC1 . "\n" ; print $SPC1 . &to_xml_("grammar", \%grams ) . "\n" ; print $SPC1 . "\n"; my %trans = @{ $entry{'translations'} }; foreach my $k (sort keys %trans) { print $SPC2 . "\n"; } print $SPC1 . "\n"; print $SPC1 . "\n"; my %cites = @{ $entry{'citations'} }; #print "\%cites=" . @{ %cites } . "\n" if $verbose_ ; foreach my $cid (sort { &get_id_($a) <=> &get_id_($b) } keys %cites) { #print STDERR "\$cites{$cid}=$cites{$cid}" . "\n" if $verbose_ ; print $SPC2 . "\n"; my $text = $cites{$cid} ; my $cite = $cid ; # id#prayer my @citation = split /#/, $cite ; print $SPC3 . "" . "" . "\n" ; print $SPC2 . "\n"; } print $SPC1 . "\n"; print "\n"; } print "\n"; } ############################################################################### # extract grammatical info from ############################################################################### sub grams_() { my $verba = shift ; my %ignore_attribs = (lang => 1, radix => 1, object => 1, object_id => 1, verb => 1, modifies => 1, proper_name => 1); my %parts = (N => 'Noun', PRON => 'Pronoun', V => 'Verb', VT => 'Verb transitive', VSDEP => 'Semideponent verb', IDIOM => 'Idiom', VDEP => 'Deponent verb', VPAR => 'Verb participle', PREP => 'Preposition', ADJ => 'Adjective', ADV => 'Adverb', CONJ => 'Conjunction', VOC => 'Vocative', NUM => 'Number', INTERJ => 'Interjection'); my %ordinals = ('?' => 'Unknown', 0 => 'Zero', 1 => '1st', 2 => '2nd', 3 => '3rd', 4 => '4th', 5 => '5th', 6 => '6th', 7 => '7th', 8 => '8th', 9 => '9th', '1,2' => '1st, 2nd' ); my %tenses = (PRES => 'Present', PERF => 'Perfect', FUT => 'Future', FUTP => 'Future perfect', IMPF => 'Imperfect'); my %voices = (ACTIVE => 'Active', PASSIVE => 'Passive'); my %cases = ('?' => 'Unknown', NOM => 'Nominative', GEN => 'Genitive', VOC => 'Vocative', ABL => 'Ablative', DAT => 'Dative', ACC => 'Accusative', DEMONST => 'Demonstrative'); my %numbers = (S => 'Singular', P => 'Plural', X => 'No number'); my %genders = (M => 'Masculine', F => 'Feminine', N => 'Neuter', X => 'None', C => 'Common'); my %moods = (SUB => 'Subjunctive', IND => 'Indicative', IMP => 'Imperative', INF => 'Infinitive', PPL => 'Participle'); my %grams = (); my $gstr = "" ; my $nm = "" ; #foreach my $k (keys %ignore_attribs) {print STDERR "\$ignore_attribs{$k}=$ignore_attribs{$k}\n";} foreach my $attrib ($verba->getAttributes()) { $nm = $attrib->getName(); #print "\$nm=$nm \$ignore_attribs{$nm}=$ignore_attribs{$nm}\n" if $verbose_ ; if (not $ignore_attribs{$nm}) { my $val = $attrib->getData(); #print "\$val=$val\n" if $verbose_ ; if ($nm =~ m/part/) { $grams{$nm} = "Part: " . $parts{$val} ; print STDERR "\$parts{$val} not found\n" if (not defined $parts{$val} or not $parts{$val}) and $verbose_ ; } elsif ($nm =~ m/declension/) { $grams{$nm} = "Declension: " . $ordinals{$val} ; print STDERR "\$ordinals{$val} not found\n" if (not defined $ordinals{$val} or not $ordinals{$val}) and $verbose_ ; } elsif ($nm =~ m/conjugation/) { $grams{$nm} = "Conjugation: " . $ordinals{$val} ; print STDERR "\$ordinals{$val} not found\n" if (not defined $ordinals{$val} or not $ordinals{$val}) and $verbose_ ; } elsif ($nm =~ m/tense/) { $grams{$nm} = "Verb tense: " . $tenses{$val} ; print STDERR "\$tenses{$val} not found\n" if (not defined $tenses{$val} or not $tenses{$val}) and $verbose_ ; } elsif ($nm =~ m/case/) { $grams{$nm} = "Case: " . $cases{$val} ; print STDERR "\$cases{$val} not found\n" if (not defined $cases{$val} or not $cases{$val}) and $verbose_ ; } elsif ($nm =~ m/number/) { $grams{$nm} = "Number: " . $numbers{$val} ; print STDERR "\$numbers{$val} not found\n" if (not defined $numbers{$val} or not $numbers{$val}) and $verbose_ ; } elsif ($nm =~ m/gender/) { $grams{$nm} = "Gender: " . $genders{$val} ; print STDERR "\$genders{$val} not found\n" if (not defined $genders{$val} or not $genders{$val}) and $verbose_ ; } elsif ($nm =~ m/person/) { $grams{$nm} = "Person: " . $ordinals{$val} ; print STDERR "\$ordinals{$val} not found\n" if (not defined $ordinals{$val} or not $ordinals{$val}) and $verbose_ ; } elsif ($nm =~ m/voice/) { $grams{$nm} = "Voice: " . $voices{$val} ; print STDERR "\$voices{$val} not found\n" if (not defined $voices{$val} or not $voices{$val}) and $verbose_ ; } elsif ($nm =~ m/mood/) { $grams{$nm} = "Mood: " . $moods{$val} ; print STDERR "\$moods{$val} not found\n" if (not defined $moods{$val} or not $moods{$val}) and $verbose_ ; } $grams{$nm} = $attrib->getData() if not $grams{$nm} ; $gstr .= $grams{$nm} . "; " ; } } $gstr =~ s/\s+/ /g ; $gstr =~ s/^\s*// ; $gstr =~ s/\s*$// ; $grams{'string'} = $gstr ; return %grams ; } ############################################################################### # traverse hash to make XML ############################################################################### sub to_xml_() { my $element = shift ; my $href = shift ; my $xml = "<" . $element . ">" ; for my $key (keys %{ $href } ) { $xml .= "<" . $key . ">{$key} . "]]>" ; } $xml .= "" ; return $xml ; } ############################################################################### # Example: # # ($date,$time) = &time_stamp(); # # Results: # # $date = 2005-08-10 # $time = 08:15:32 ############################################################################### sub time_stamp { my ($d,$t); my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); $year += 1900; $mon++; $d = sprintf("%4d-%2.2d-%2.2d",$year,$mon,$mday); $t = sprintf("%2.2d:%2.2d:%2.2d",$hour,$min,$sec); return($d,$t); } ############################################################################### # Open and read file of words that are too frequent to cite ############################################################################### sub read_stop_words { my @stop_words = (); my $fh = new IO::File ; print STDERR "opening ${freqs_file_}...\n" if $verbose_ ; if ($fh->open("< ${freqs_file_}")) { my $doc = $parser->parse_fh( $fh ); my $root = $doc->getDocumentElement ; my @terms = $root->getElementsByTagName('term') ; print STDERR "\@terms.size=" . @terms . "\n" if $verbose_ ; foreach my $term (@terms) { #print STDERR "pushing " . $term->textContent . "\n" if $verbose_ ; push @stop_words, $term->textContent ; } $fh->close ; } else { die "cannot open: $!" ; } return @stop_words ; } ############################################################################### # get ID from "id#name" ############################################################################### sub get_id_() { my $key = shift ; my @pair = split /#/, $key ; return $pair[0] ; } ############################################################################### # dump ############################################################################### sub dump_() { my %h = shift ; foreach my $key (keys %h) { print STDERR "\$key=$key \$val=$h{$key}" . "\n" ; } }