#!/usr/bin/env python
# $Id: extract_gdd_notes.py,v 1.2 2012/10/21 18:49:55 asdrury Exp $
# http://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0017
# Linux:
#  $ export LANG=en_US.UTF-8
#  [perseus 502]$ python --version
#  Python 3.2.3
# Windows:
# > set PYTHONIOENCODING=utf-8

################################################################################################################################
__author__ = "A. S. Drury"
__version__ = "$Revision: 1.2 $"
__date__ = "$Date: 2012/10/21 18:49:55 $"
################################################################################################################################

################################################################################################################################
import sys, re
from lxml import etree
from optparse import OptionParser
################################################################################################################################

################################################################################################################################
# Command-line interface: one option, -b/--bfname, naming the XML file to
# read (defaults to "b.xml"). Parsing happens at import time, so `options`
# and `args` are available to everything below.
parser = OptionParser(
    version="$Id: extract_gdd_notes.py,v 1.2 2012/10/21 18:49:55 asdrury Exp $",
    usage="%prog -b xml [-h]",
)
parser.add_option(
    "-b",
    "--bfname",
    action="store",
    type="string",
    dest="bfname",
    default="b.xml",
    help="Second XML file.  Default=b.xml",
)
options, args = parser.parse_args()
################################################################################################################################

################################################################################################################################
# http://www.python.org/peps/pep-0263.html for details
# http://docs.python.org/howto/unicode.html
# http://www.joelonsoftware.com/articles/Unicode.html
# The rules for translating a Unicode string into a sequence of bytes are called an encoding.
#u = "é".encode("utf-8")
#u = str("é", errors="strict"); print(u)
################################################################################################################################

################################################################################################################################
# Numerals for the books the script was originally written for. Kept as a
# lookup table so these inputs (and any direct use of `latins`) behave
# exactly as before.
latins = {'1':'I', '2':'II', '3':'III', '4':'IV', '5':'V', '6':'VI', '7':'VII', '8':'VIII'}

# Value/symbol pairs in descending order, including the subtractive forms
# (IV, IX, XL, ...), used to build numerals that are not in `latins`.
_ROMAN_PAIRS = (
    (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
    (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
    (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
)


def latinize(n):
    """Return the Roman numeral for *n*.

    *n* may be an int or a string of digits; it is converted with ``str()``
    before lookup. Values present in ``latins`` are returned from the table;
    any other positive integer is converted arithmetically, so numbers above
    eight no longer fail.

    Raises:
        KeyError: if *n* is not a positive integer (for example ``None``,
            ``'abc'`` or ``0``). This is the same exception type the plain
            table lookup raised for unknown keys.
    """
    key = str(n)
    if key in latins:
        return latins[key]
    try:
        remaining = int(key)
    except ValueError:
        raise KeyError(key)
    if remaining <= 0:
        # Roman numerals have no representation for zero or negatives.
        raise KeyError(key)
    pieces = []
    for value, symbol in _ROMAN_PAIRS:
        count, remaining = divmod(remaining, value)
        pieces.append(symbol * count)
    return ''.join(pieces)
################################################################################################################################

################################################################################################################################
def normalize(s, tag=None):
    """Return *s* with whitespace collapsed and ampersands escaped.

    Leading/trailing whitespace is removed, every internal run of whitespace
    becomes a single space, and each ``&`` is replaced by ``&amp;``. If *tag*
    is given and the cleaned text is not empty, the text is wrapped as
    ``<tag>text</tag>``.

    Args:
        s: the string to clean, or ``None``.
        tag: optional tag name (converted with ``str()``) to wrap the result in.

    Returns:
        The cleaned string; ``''`` when *s* is ``None`` or blank (no tag
        wrapper is added around empty text).
    """
    if s is None:
        return ''
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', s.strip())
    # '&' is not special in a Python replacement string. The previous
    # template '\&amp;' kept its backslash and produced '\&amp;' in the
    # output, so a plain str.replace is used instead.
    text = text.replace('&', '&amp;')
    if tag is not None and text != '':
        text = '<' + str(tag) + '>' + text + '</' + str(tag) + '>'
    return text
################################################################################################################################

################################################################################################################################
def get_text(e, tag=None):
    """Return the normalized text and tail of element *e*.

    The element's ``text`` is taken if it is non-blank; its ``tail``, if
    non-blank, is appended after a single space. The combined string is then
    passed to ``normalize()`` together with *tag*. A ``None`` element is
    treated as having no text at all.
    """
    combined = ''
    if e is not None:
        leading = e.text
        if leading is not None and leading.strip() != '':
            combined = leading
        trailing = e.tail
        if trailing is not None and trailing.strip() != '':
            combined = combined + ' ' + trailing
    return normalize(combined, tag)
################################################################################################################################

################################################################################################################################
def p(s):
    """Print *s* with surrounding whitespace removed; skip None and blank strings."""
    if s is None:
        return
    trimmed = s.strip()
    if trimmed != '':
        print(trimmed)
################################################################################################################################

################################################################################################################################
def cat(note, s, html=None):
    """Append *s* to ``note.text`` and return *note*.

    *s* is stripped and its internal whitespace runs are collapsed to single
    spaces; it is then joined to the existing text with one space, and the
    result is stripped again. ``None`` or blank *s* adds nothing (but
    ``note.text`` is still set to a string).

    The text is stored unescaped on purpose: the script builds *note* as an
    lxml element and serializes it with ``etree.tostring()``, which escapes
    ``&`` itself. Escaping here as well put ``&amp;amp;`` (plus a stray
    backslash) into the output.

    Args:
        note: object with a ``text`` attribute (an element); modified in place.
        s: text to append, or ``None``.
        html: accepted for backward compatibility; not used.

    Returns:
        The same *note* object.
    """
    addition = '' if s is None else re.sub(r'\s+', ' ', s.strip())
    existing = note.text if note.text is not None else ''
    note.text = (existing + ' ' + addition).strip()
    return note
################################################################################################################################

################################################################################################################################
# Main script: read the XML file named by --bfname, walk every <div1>, and
# write one <note> element per 'smythp' milestone to stdout, wrapped in a
# <notes> root element.

# NOTE(review): this XMLParser is never handed to etree.parse() below, so
# remove_blank_text currently has no effect; the assignment also rebinds the
# module-level name `parser` (previously the OptionParser). Left as-is so the
# parsing behavior does not change.
parser = etree.XMLParser(remove_blank_text=True)

# Parse inside a `with` block so the input file is closed as soon as the
# tree has been built.
with open(options.bfname, encoding="utf-8", mode="rt") as bfile:
    btree = etree.parse(bfile)
belements = btree.xpath("//div1")


def _emit(note):
    """Print one note element as pretty-printed XML."""
    print(etree.tostring(note, pretty_print=True).decode("utf-8"))


def _append_text_and_tail(note, element):
    """Append element.text and then element.tail to the note via cat()."""
    # The raw text and tail are handed to cat() directly instead of going
    # through get_text() first: get_text() returns text that has already been
    # through normalize() (ampersands escaped), and cat() normalizes what it
    # is given, so that route processed the same text twice, unlike the
    # lemma/gloss/emph branches.
    note = cat(note, element.text)
    return cat(note, element.tail)


n = None        # 'n' attribute of the most recent milestone that had one
cap = None      # 'n' attribute of the current Chapter <div2>
liber = None    # Roman numeral of the current Book <div1>
note = None     # note element currently being built
i = 0           # running local ID, bumped for every milestone with an 'n'
print('<?xml version="1.0" encoding="utf-8"?>')
print('<notes>')
for b in belements:
    if 'div1' == b.tag and 'Book' == b.get('type'):
        liber = latinize(b.get('n'))
    for child in b:
        if not ('div2' == child.tag and 'Chapter' == child.get('type')):
            continue
        cap = child.get('n')
        # NOTE(review): any content that precedes the first 'smythp'
        # milestone reaches cat() while `note` is still None and fails there.
        for gchild in child:
            tag = gchild.tag
            if tag == 'milestone':
                unit = gchild.get('unit')
                if unit == 'smythp' and note is not None:
                    # Print previously constructed note
                    _emit(note)
                if gchild.get('n') is not None:
                    # Get n of new note
                    n = gchild.get('n')
                    # Increment i == local ID for next note
                    i = i + 1
                if unit == 'smythp':
                    # Start new note
                    note = etree.Element("note", liber=str(liber), capitulum=str(cap), n=str(n), id=str(i))
                elif unit == 'para':
                    # Paragraph text
                    note = _append_text_and_tail(note, gchild)
            elif tag == 'lemma':
                if gchild.text is not None:
                    note = cat(note, gchild.text, 'b')
                if gchild.tail is not None:
                    note = cat(note, gchild.tail)
            elif tag in ('bibl', 'foreign'):
                note = _append_text_and_tail(note, gchild)
            elif tag == 'xref':
                inner_lemma = gchild.find("lemma")
                if inner_lemma is not None:
                    note = _append_text_and_tail(note, inner_lemma)
                note = _append_text_and_tail(note, gchild)
            elif tag in ('gloss', 'emph'):
                note = cat(note, gchild.text, 'b')
                note = cat(note, gchild.tail)

# Inside the loop a note is only printed when the *next* 'smythp' milestone
# arrives, so the last note of the document has to be flushed here; without
# this it was silently dropped.
if note is not None:
    _emit(note)
print('</notes>')