# Source code for pypolibox.hlds

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <arne-neumann@web.de>

r"""
HLDS (Hybrid Logic Dependency Semantics) is the format internally used by the 
OpenCCG realizer. This module shall allow the conversion between HLDS-XML 
files and NLTK feature structures. In addition, it can be used as a
command-line tool to convert HLDS-XML files into printable versions of
``nltk.FeatStruct``s. The following command produces a LaTeX file that can be
compiled into a PDF::

    python hlds.py --format latex --outfile output.tex input1.xml input2.xml

Alternatively, you can also produce 'ASCII art' with this command::

    python hlds.py --format nltk --outfile output.txt input1.xml input2.xml

This way, the phrase 'das Buch' can be transformed from this HLDS-XML
representation::

    <?xml version="1.0" encoding="UTF-8"?>
    <xml>
      <lf>
        <satop nom="b1:artefaktum">
          <prop name="Buch"/>
          <diamond mode="NUM">
            <prop name="sing"/>
          </diamond>
          <diamond mode="ART">
            <nom name="d1:sem-obj"/>
            <prop name="def"/>
          </diamond>
        </satop>
      </lf>
      <target>das Buch</target>
    </xml>

To this attribute-value matrix (LaTeX)::

    \begin{avm}
        \[ $*$nom$*$  & `b1:artefaktum' \\
           $*$prop$*$ & `Buch' \\
           $*$text$*$ & `das Buch' \\
           NUM        & \[ prop & `sing' \] \\
           ART        & \[ nom  & `d1:sem-obj' \\
                           prop & `def' \] \\
        \]
    \end{avm}

or this one (plain text)::

    [ *root_nom*        = 'b1:artefaktum'           ]
    [ *root_prop*       = 'Buch'                    ]
    [ *text*            = 'das Buch'                ]
    [                                               ]
    [ 00__NUM           = [ *mode* = 'NUM'  ]       ]
    [                     [ prop   = 'sing' ]       ]
    [                                               ]
    [                     [ *mode* = 'ART'        ] ]
    [ 01__ART           = [ nom    = 'd1:sem-obj' ] ]
    [                     [ prop   = 'def'        ] ] 
"""

import argparse
import sys
import re
import random
from collections import defaultdict
from operator import itemgetter
from lxml import etree
from lxml.builder import ElementMaker
import nltk
from nltk.featstruct import Feature, FeatDict
from util import ensure_utf8, ensure_unicode, write_to_file


class HLDSReader(object):
    """
    Represents a list of sentences (as NLTK feature structures) parsed from
    an HLDS XML testbed file.

    After construction, ``self.sentences`` holds one ``Sentence`` per parsed
    sentence and ``self.xml_sentences`` holds the ``<item>`` elements found
    in the document (an empty list for single-sentence files).
    """
    def __init__(self, hlds, input_format="file"):
        """
        :type hlds: ``str`` or ``file``
        :param hlds: an HLDS XML testbed file (either a file name / file
        object or a string containing the XML, depending on the input_format
        parameter)

        :type input_format: ``str``
        :param input_format: "string" or "file"

        :raises ValueError: if input_format is neither "string" nor "file"
        """
        if input_format == "string":
            tree = etree.fromstring(hlds)
        elif input_format == "file":
            tree = etree.parse(hlds)
        else:
            # Fail loudly instead of returning a reader that has no
            # ``sentences`` attribute at all.
            raise ValueError(
                "unknown input_format {0!r}: expected 'string' or "
                "'file'".format(input_format))
        self.parse_sentences(tree)

    def parse_sentences(self, tree):
        """
        Parses all sentences (represented as HLDS XML structures)
        into feature structures. These structures are saved as a list of
        ``Sentence``s in self.sentences.

        If there's only one sentence in a file, its root element is <xml>.
        If there's more than one, each sentence's <xml> root is wrapped in
        an <item>...</item> (and <regression> becomes the root tag of the
        document).

        :type tree: ``etree._ElementTree`` or ``etree._Element``
        :param tree: the parsed HLDS XML document

        :raises ValueError: if a sentence lacks its <satop> element
        """
        self.sentences = []

        self.xml_sentences = tree.findall("item")
        if self.xml_sentences:  # there are multiple sentences
            for item in self.xml_sentences:
                self.sentences.append(
                    self.parse_sentence(item, single_sent=False))
        else:  # no <item> tag --> file contains just one sentence
            root = tree.find("lf/satop")  # root (verb) of the sentence
            self.sentences = [self.parse_sentence(root, single_sent=True)]

    def parse_sentence(self, sentence, single_sent=True):
        """
        Converts one HLDS XML sentence into a ``Sentence`` feature structure.

        :type sentence: ``etree._Element``
        :param sentence: the <satop> element of the sentence (if single_sent
        is true) or the <item> element wrapping it (if single_sent is false)

        :type single_sent: ``bool``
        :param single_sent: True, if the document contains just this one
        sentence; False, if the sentence is an <item> of a testbed file

        :rtype: ``Sentence``
        :raises ValueError: if no <satop> element is available
        """
        missing_satop = "HLDS XML sentence doesn't contain a <satop> element"

        if single_sent:
            satop = sentence
            if satop is None:
                raise ValueError(missing_satop)

            target_element = satop.getroottree().find("target")
            if target_element is not None:
                # an empty <target/> has no text (None); treat it like a
                # missing <target>
                sentence_string = target_element.text or ""
            else:
                sentence_string = ""
            expected_parses = 1
        else:
            item = sentence
            satop = item.find("xml/lf/satop")  # root (verb) of the sentence
            if satop is None:
                raise ValueError(missing_satop)

            # <item numOfParses="4" string="er beschreibt sie">
            sentence_string = ensure_utf8(item.attrib["string"])
            expected_parses = item.attrib["numOfParses"]

        # <satop nom="b1:handlung">
        #   <prop name="beschreiben"/>
        prop_element = satop.find("prop")
        # some HLDS satop structures don't have a <prop> tag
        root_prop = ""
        if prop_element is not None:
            root_prop = prop_element.attrib["name"]
        root_nom = satop.attrib["nom"]

        diamonds = [convert_diamond_xml2fs(element)
                    for element in satop.findall("diamond")]

        parsed_sentence = Sentence()
        parsed_sentence.create_sentence(sentence_string, expected_parses,
                                        root_nom, root_prop, diamonds)
        return parsed_sentence


class Sentence(FeatDict):
    """
    An HLDS sentence, stored as an NLTK feature structure.
    """
    def create_sentence(self, sent_str, expected_parses, root_nom, root_prop,
                        diamonds):
        """
        Fills this feature structure with the meta data of a sentence (text,
        expected number of parses, root nom/prop) and with its already
        constructed ``Diamond``s.

        Each diamond is stored under the key "<position>__<mode>", where
        <position> is its zero-padded, two-digit position in ``diamonds``
        (e.g. "00__NUM", "01__ART").

        :type sent_str: ``str``
        :param sent_str: the text that should be generated

        :type expected_parses: ``int``
        :param expected_parses: the expected number of parses (converted
        with ``int()``, so a numeric string is accepted as well)

        :type root_nom: ``str``
        :param root_nom: the root (element/verb) category, e.g. "b1:handlung"

        :type root_prop: ``str``
        :param root_prop: the root element of that text (in case we're
        actually generating a sentence: the main verb). It is only stored
        if it is non-empty.

        :type diamonds: ``list`` of ``Diamond``s
        :param diamonds: the diamonds that are contained in the sentence
        """
        self[Feature("text")] = sent_str
        self[Feature("expected_parses")] = int(expected_parses)
        self[Feature("root_nom")] = root_nom
        # not always present, e.g. when realizing a pronoun
        if root_prop:
            self[Feature("root_prop")] = root_prop

        for position, diamond in enumerate(diamonds or ()):
            key = "{0}__{1}".format(str(position).zfill(2),
                                    diamond[Feature("mode")])
            self[key] = diamond

        
class Diamond(FeatDict):
    """
    A ``Diamond`` represents an HLDS diamond in form of a (nested) feature
    structure containing the elements nom? prop? diamond* ::

        <diamond mode="AGENS">
            <nom name="s1:addition"/>
            <prop name="sowohl"/>
            <diamond mode="NP1">
                <nom name="h1:nachname"/>
                <prop name="Hausser"/>
            </diamond>
            ...
        </diamond>

    Subdiamonds are stored under keys of the form "<index>__<mode>", e.g.
    "00__AGENS", where <index> is zero-padded to two digits.
    """
    def _detach_subdiamonds(self):
        """
        Removes all subdiamonds from this diamond and returns them as a list
        that is ordered by their former keys.

        :rtype: ``list`` of ``Diamond``s
        """
        # a featstruct is essentially a dictionary, so the keys have to be
        # sorted to recover the order of the subdiamonds
        subdiamond_keys = sorted(key for (key, value) in self.items()
                                 if isinstance(value, Diamond))
        detached = []
        for key in subdiamond_keys:
            detached.append(self.pop(key))
        return detached

    def append_subdiamond(self, subdiamond, mode=None):
        """
        Adds a subdiamond after the subdiamonds this diamond already has,
        optionally changing the mode of the subdiamond first.

        :type subdiamond: ``Diamond``

        :type mode: ``str`` or ``NoneType``
        :param mode: the mode that the subdiamond shall have. It is also
        used to build the subdiamond's identifier: if the diamond already
        has two subdiamonds (e.g. "00__AGENS" and "01__PATIENS") and a
        third one with mode "TEMP" is appended, its identifier will be
        "02__TEMP". If mode is None (or empty), the subdiamond's mode is
        left untouched.
        """
        position = last_diamond_index(self) + 1

        if mode:
            subdiamond.update({Feature("mode"): mode})

        key = "{0}__{1}".format(str(position).zfill(2),
                                subdiamond[Feature("mode")])
        self.update({key: subdiamond})

    def prepend_subdiamond(self, subdiamond_to_prepend, mode=None):
        """
        Adds a subdiamond in front of the subdiamonds this diamond already
        has, optionally changing the mode of the new subdiamond first.

        :type subdiamond_to_prepend: ``Diamond``

        :type mode: ``str`` or ``NoneType``
        :param mode: the mode that the subdiamond shall have. It is also
        used to build the subdiamond's identifier: if the diamond already
        has two subdiamonds (e.g. "00__AGENS" and "01__PATIENS") and a
        third one with mode "TEMP" is prepended, its identifier will be
        "00__TEMP", while the existing subdiamonds are renumbered to
        "01__AGENS" and "02__PATIENS". If mode is None (or empty), the
        subdiamond's mode is left untouched.
        """
        if mode:
            subdiamond_to_prepend.update({Feature("mode"): mode})

        # take everything out, then re-append in the new order so that all
        # identifiers get renumbered
        former_subdiamonds = self._detach_subdiamonds()
        for diamond in [subdiamond_to_prepend] + former_subdiamonds:
            self.append_subdiamond(diamond)

    def insert_subdiamond(self, index, subdiamond_to_insert, mode=None):
        """
        Inserts a ``Diamond`` into this one before the given index (same
        semantics as ``list.insert``), optionally changing the mode of the
        new subdiamond first.

        :type index: ``int``
        :type subdiamond_to_insert: ``Diamond``

        :type mode: ``str`` or ``NoneType``
        :param mode: the mode that the subdiamond shall have. It is also
        used to build the subdiamond's identifier: if the diamond already
        has two subdiamonds (e.g. "00__AGENS" and "01__PATIENS") and a
        third one with mode "TEMP" is inserted at index 1, its identifier
        will be "01__TEMP", while the existing subdiamonds end up as
        "00__AGENS" and "02__PATIENS". If mode is None (or empty), the
        subdiamond's mode is left untouched.
        """
        if mode:
            subdiamond_to_insert.update({Feature("mode"): mode})

        # take everything out, then re-append in the new order so that all
        # identifiers get renumbered
        reordered = self._detach_subdiamonds()
        reordered.insert(index, subdiamond_to_insert)
        for diamond in reordered:
            self.append_subdiamond(diamond)

    def change_mode(self, mode):
        """
        Sets the mode of this ``Diamond``, which is sometimes needed when
        embedding it into another ``Diamond`` or ``Sentence``.

        :type mode: ``str``
        """
        self[Feature('mode')] = mode


def create_diamond(mode, nom, prop, nested_diamonds_list):
    """
    Builds a ``Diamond`` feature structure from scratch (whereas
    convert_diamond_xml2fs converts an existing HLDS XML structure into
    its feature structure representation).

    NOTE (original author): this would ideally live in ``Diamond.__init__``,
    but ``FeatDict.__new__`` complained about the additional constructor
    arguments.

    :type mode: ``str``
    :param mode: the mode of the new diamond

    :type nom: ``str``
    :param nom: the nom value; only stored if non-empty

    :type prop: ``str``
    :param prop: the prop value; only stored if non-empty

    :type nested_diamonds_list: ``list``
    :param nested_diamonds_list: subdiamonds, which are stored under the
    keys "<position>__<mode>" (position zero-padded to two digits)

    :rtype: ``Diamond``
    """
    new_diamond = Diamond()
    new_diamond[Feature('mode')] = mode

    for name, value in (("nom", nom), ("prop", prop)):
        if value:
            new_diamond[name] = value

    for position, nested in enumerate(nested_diamonds_list or ()):
        key = "{0}__{1}".format(str(position).zfill(2),
                                nested[Feature("mode")])
        new_diamond[key] = nested
    return new_diamond


def convert_diamond_xml2fs(etree):
    """
    transforms a HLDS XML <diamond>...</diamond> structure
    (that was parsed into an etree element) into an NLTK feature structure.

    Nested <diamond> elements are converted recursively. If the element
    contains several <nom> or <prop> children, the last one wins.

    NOTE: the parameter name shadows the ``etree`` module inside this
    function. It is kept for backwards compatibility with callers that pass
    it as a keyword argument.

    :type etree: ``etree._Element``
    :param etree: a diamond etree element

    :rtype: ``Diamond``
    """
    mode = ensure_utf8(etree.attrib["mode"])

    nested_diamonds = []
    nom = ""  # default value
    prop = ""  # default value

    # iterating over an element yields its direct children; this replaces
    # the deprecated ``getchildren()``
    for child in etree:
        if child.tag == "diamond":
            nested_diamonds.append(convert_diamond_xml2fs(child))
        elif child.tag == "nom":
            nom = ensure_utf8(child.attrib["name"])
        elif child.tag == "prop":
            prop = ensure_utf8(child.attrib["name"])

    return create_diamond(mode, nom, prop, nested_diamonds)
 

def hlds2xml(featstruct):
    """
    debug function that returns the string representation of a feature
    structure (Diamond or Sentence) and its HLDS XML equivalent.

    A ``Diamond`` is first converted into a ``Sentence`` (see
    diamond2sentence); the XML is generated in "realize" mode.

    :type featstruct: ``Diamond`` or ``Sentence``
    :rtype: ``str``
    :raises TypeError: if featstruct is neither a ``Diamond`` nor a
    ``Sentence``
    """
    # explicit check instead of ``assert``, which is stripped when Python
    # runs with -O
    if isinstance(featstruct, Diamond):
        sentence = diamond2sentence(featstruct)
    elif isinstance(featstruct, Sentence):
        sentence = featstruct
    else:
        raise TypeError("expected a Diamond or a Sentence, got "
                        "{0}".format(type(featstruct).__name__))

    input_str = featstruct.__str__()
    output_str = create_hlds_file(sentence, mode="realize", output="xml")

    return "Input:\n\n{0}\n\nOutput:\n\n{1}".format(input_str, output_str)


def create_hlds_file(sent_or_sent_list, mode="test", output="etree"):
    """
    this function transforms ``Sentence``s into a valid HLDS XML testbed file

    :type sent_or_sent_list: ``Sentence`` or ``list`` of ``Sentence``s
    :param sent_or_sent_list: a ``Sentence`` or a list of ``Sentence``s

    :type mode: ``str``
    :param mode: "test", if the sentence will be part of a (regression)
    testbed file (ccg-test). "realize", if the sentence will be put in a
    file on its own (ccg-realize).

    :type output: ``str``
    :param output: "etree" (etree element tree) or "xml" (formatted, valid
    xml document as a string)

    :rtype: ``etree._ElementTree`` or ``str``
    :return: the document as an element tree or as an XML string, depending
    on ``output``. Any other ``output`` value yields ``None``.

    :raises ValueError: if mode is unknown, or if mode is "realize" and
    sent_or_sent_list isn't exactly one ``Sentence``
    """
    # strings are compared with ``==``; ``is`` only worked by accident of
    # string interning
    if mode == "test":
        root = etree.Element("regression")

        if isinstance(sent_or_sent_list, list):
            sentences = sent_or_sent_list
        elif isinstance(sent_or_sent_list, Sentence):
            sentences = [sent_or_sent_list]
        else:
            # unchanged behaviour: unsupported input produces an empty
            # <regression> document
            sentences = []

        for sentence in sentences:
            root.append(__sentence_fs2xml(sentence, mode="test"))

    elif mode == "realize":
        root = etree.Element("xml")

        if isinstance(sent_or_sent_list, Sentence):
            sentence = sent_or_sent_list
        elif (isinstance(sent_or_sent_list, list)
              and len(sent_or_sent_list) == 1):
            sentence = sent_or_sent_list[0]
        else:
            raise ValueError(
                "in 'realize' mode, sent_or_sent_list should be "
                "one Sentence or a list containing only one Sentence.")
        root.append(__sentence_fs2xml(sentence, mode="realize"))

    else:
        raise ValueError("unknown mode {0!r}: expected 'test' or "
                         "'realize'".format(mode))

    doc = etree.ElementTree(root)
    if output == "etree":
        return doc
    if output == "xml":
        return etreeprint(doc, debug=False)
    return None


def __sentence_fs2xml(sentence, mode="test"):
    """
    transforms a sentence (in NLTK feature structure notation) into its
    corresponding HLDS XML structure.

    :type sentence: ``Sentence``
    :param sentence: a sentence in NLTK feature structure notation

    :type mode: ``str``
    :param mode: "test", if the sentence will be part of a (regression)
    testbed file (ccg-test). Any other value (normally "realize") means
    the sentence will be put in a file on its own (ccg-realize).

    :rtype: ``etree._Element``
    :return: the input sentence in HLDS XML format: an <item> element (that
    wraps <xml><lf>...) in "test" mode, otherwise the bare <lf> element
    """
    # strings are compared with ``==``; ``is`` only worked by accident of
    # string interning
    if mode == "test":
        expected_parses = sentence[Feature("expected_parses")]
        text = sentence[Feature("text")]
        item = etree.Element("item", numOfParses=str(expected_parses),
                             string=ensure_unicode(text))
        xml = etree.SubElement(item, "xml")
        lf = etree.SubElement(xml, "lf")
        result = item
    else:  # mode is "realize"
        lf = etree.Element("lf")
        result = lf

    # attribute values are passed through ensure_unicode, as is done for
    # every other attribute written by this module
    root_nom = sentence[Feature("root_nom")]
    satop = etree.SubElement(lf, "satop", nom=ensure_unicode(root_nom))

    if Feature("root_prop") in sentence:
        root_prop = sentence[Feature("root_prop")]
        etree.SubElement(satop, "prop", name=ensure_unicode(root_prop))

    # keys need to be sorted, otherwise the Diamonds within a Sentence
    # would not keep their original order (a feature structure is
    # essentially an unordered dictionary)
    for key in sorted(sentence.keys()):
        value = sentence[key]
        if isinstance(value, Diamond):
            satop.append(__diamond_fs2xml(value))

    return result
 
def __diamond_fs2xml(diamond):
    """
    Builds the HLDS XML <diamond> element (an etree element) that
    corresponds to a ``Diamond`` feature structure. Subdiamonds are
    converted recursively.

    :type diamond: ``Diamond``
    :param diamond: a Diamond feature structure containing nom? prop?
    diamond* elements

    :rtype: ``etree._Element``
    :return: the Diamond in HLDS XML tree notation
    """
    make = ElementMaker()
    diamond_element = make.diamond(
        mode=ensure_unicode(diamond[Feature("mode")]))

    # if present, nom(inal) has to be the first sub tag of a diamond,
    # followed by prop
    for tag in ("nom", "prop"):
        if tag in diamond:
            tag_factory = getattr(make, tag)
            diamond_element.append(
                tag_factory(name=ensure_unicode(diamond[tag])))

    # sort the keys: a feature structure is essentially an unordered
    # dictionary, so this is needed to keep the subdiamonds in their
    # original order
    for key in sorted(diamond.keys()):
        value = diamond[key]
        if isinstance(value, Diamond):
            diamond_element.append(__diamond_fs2xml(value))

    return diamond_element

def diamond2sentence(diamond):
    """
    Turns a ``Diamond`` feature structure into a ``Sentence`` feature
    structure. This is needed whenever a short utterance shall be realized,
    e.g. "die Autoren" or "die Themen Syntax und Pragmatik".

    The diamond's nom and prop become the root nom and root prop of the
    sentence (empty strings if missing), its subdiamonds become the
    diamonds of the sentence. The sentence text is left empty and the
    expected number of parses is set to 1.

    Note (original author): OpenCCG does not really distinguish between a
    sentence and smaller units of meaning. It assigns the <sentence> tag to
    every HLDS structure it realizes, whereas each substructure of this
    "sentence" (no matter how complex) is labelled as a <diamond>.

    :type diamond: ``Diamond``
    :rtype: ``Sentence``
    """
    root_nom = diamond["nom"] if "nom" in diamond else ""
    root_prop = diamond["prop"] if "prop" in diamond else ""

    # sort the keys: a feature structure is essentially an unordered
    # dictionary, so this is needed to keep the subdiamonds in their
    # original order
    subdiamonds = [diamond[key] for key in sorted(diamond.keys())
                   if isinstance(diamond[key], Diamond)]

    converted = Sentence()
    converted.create_sentence("", 1, root_nom, root_prop, subdiamonds)
    return converted


def test_conversion(testbed_file=None):
    """
    tests HLDS XML <-> NLTK feature structures conversions. converts an
    HLDS XML testbed file into a list of sentences in NLTK feature
    structure. picks one of these sentences randomly and converts it back
    to HLDS XML. prints both versions of this sentence. returns an
    HLDSReader instance (containing a list of ``Sentence``s in NLTK feature
    structure notation) and a HLDS XML testbed file (as a string) created
    from those feature structures.

    :type testbed_file: ``str`` or ``file`` or ``NoneType``
    :param testbed_file: the HLDS XML testbed file to read. If omitted, a
    module-level variable named ``testbed_file`` is used (this module does
    not define one itself, so it has to be set by the caller).

    :rtype: ``tuple`` of (``HLDSReader``, ``str``)
    :return: a tuple containing an HLDSReader instance and a string
    representation of an HLDS XML testbed file

    :raises ValueError: if no testbed file is available
    """
    if testbed_file is None:
        # backwards compatible fallback: the original implementation read
        # a global of this name
        testbed_file = globals().get("testbed_file")
    if testbed_file is None:
        raise ValueError("no HLDS XML testbed file given")

    hlds_reader = HLDSReader(testbed_file, input_format="file")
    random_sent_index = random.randrange(0, len(hlds_reader.sentences))
    random_sent_fs = hlds_reader.sentences[random_sent_index]
    random_sent_xml = create_hlds_file([random_sent_fs], output="xml")
    all_sents_xml = create_hlds_file(hlds_reader.sentences, output="xml")

    # a single parenthesised argument prints the same way under Python 2
    # and Python 3
    print("random sentence: %i\n%s \n\n\n%s"
          % (random_sent_index, random_sent_fs, random_sent_xml))
    return hlds_reader, all_sents_xml


def add_mode_suffix(diamond, mode="N"):
    """
    Numbers the modes of sibling subdiamonds that share the given mode,
    modifying the structure in place.

    Within ``diamond``, every subdiamond whose key is a ``str`` ending with
    ``mode`` and whose mode feature equals ``mode`` gets a 1-based running
    number appended to its mode, following the sorted order of their keys
    (e.g. two subdiamonds with mode "N" become "N1" and "N2"). Afterwards,
    the same is done recursively for all ``Diamond``s contained in
    ``diamond``.

    :type diamond: ``Diamond`` or ``Sentence``
    :type mode: ``str``
    :param mode: the mode that shall be numbered
    """
    keys_to_number = sorted(
        key for key in diamond.keys()
        if isinstance(key, str) and key.endswith(mode)
        and diamond[key][Feature("mode")] == mode)

    for number, key in enumerate(keys_to_number, 1):
        diamond[key][Feature("mode")] = "{0}{1}".format(mode, number)

    for value in diamond.values():
        if isinstance(value, Diamond):
            add_mode_suffix(value, mode)


def add_nom_prefixes(diamond):
    """
    Puts a prefix/index in front of the nom value of every ``Diamond``
    contained in a ``Diamond`` or ``Sentence`` structure (in place).
    According to the original author, ``ccg-realize`` only produces
    gibberish without these prefixes.

    Every <nom> tag has a 'name' attribute, which contains a
    category/type-like description of the corresponding <prop> tag's name
    attribute, e.g.::

        <diamond mode="PRÄP">
            <nom name="v1:zugehörigkeit"/>
            <prop name="von"/>
        </diamond>

    Here 'zugehörigkeit' is the name of a category that the preposition
    'von' belongs to. The prefix consists of a character (usually the first
    character of the prop value, see __determine_nom_prefix) and a running
    number that is counted separately for each prefix character, in the
    order in which ``walk()`` visits the diamonds. In the example,
    'v1:zugehörigkeit' means that "von" is the first diamond in the
    structure whose prefix character is 'v'.

    :type diamond: ``Diamond`` or ``Sentence``
    """
    prefix_counts = defaultdict(int)

    for element in list(diamond.walk()):
        if type(element) is not Diamond:
            continue
        if "nom" not in element.keys():
            continue

        prefix_char = __determine_nom_prefix(element)
        prefix_counts[prefix_char] += 1

        plain_nom = element["nom"]
        original_type = type(plain_nom)
        element["nom"] = "{0}{1}:{2}".format(ensure_utf8(prefix_char),
                                             prefix_counts[prefix_char],
                                             ensure_utf8(plain_nom))
        if original_type == unicode:
            # preserve unicode, if the string was unicode encoded before
            element["nom"] = ensure_unicode(element["nom"])


def __determine_nom_prefix(diamond):
    """
    determines which character will be used as a prefix for <nom> tags.
    usually, it's the first character used in the corresponding <prop> tag
    (e.g. prop = "und" will turn nom = "konjunktion" into nom =
    "u1:konjunktion", iff it's the 1st "konjunktion" beginning with "u" in
    that sentence).

    Special cases: "n" is used if the prop value consists of digits only
    (a year, page count etc.) and "x" is used if the diamond has no prop
    or an empty one.

    :type diamond: ``Diamond``

    :rtype: ``str``
    :return: a single (lower-cased) character, passed through
    ``ensure_utf8``
    """
    if "prop" not in diamond.keys():
        return ensure_utf8("x")

    prop = ensure_utf8(diamond["prop"])
    if re.match(r"\d+$", prop):
        return ensure_utf8("n")

    # Take the first character from the unicode form of the prop. Indexing
    # a UTF-8 byte string would cut a multi-byte character (e.g. an umlaut)
    # in half. NOTE(review): assumes ensure_unicode decodes UTF-8 byte
    # strings and leaves unicode strings untouched -- confirm in util.
    first_char = ensure_unicode(diamond["prop"]).lower()[:1]
    if not first_char:  # empty <prop>: treat it like a missing one
        return ensure_utf8("x")
    return ensure_utf8(first_char)
    

def remove_nom_prefixes(diamond):
    """
    Removes the prefix/index (e.g. "v1:") from the nom value of every
    ``Diamond`` contained in a ``Diamond`` or ``Sentence`` structure, in
    place. This reverts what add_nom_prefixes does.

    A prefix is one word character followed by one or more digits and a
    colon at the very beginning of the nom value, e.g. "v1:zugehörigkeit"
    becomes "zugehörigkeit". nom values without such a prefix are left
    untouched.

    :type diamond: ``Diamond`` or ``Sentence``
    """
    prefix = re.compile(r"\w\d+:")

    for element in list(diamond.walk()):
        if type(element) is Diamond and "nom" in element.keys():
            nom = element["nom"]
            match = prefix.match(nom)
            if match:
                # drop only the leading prefix, keep the rest unchanged
                element["nom"] = nom[match.end():]


def last_diamond_index(featstruct):
    """
    Returns the highest index currently used within a given ``Diamond`` or
    ``Sentence``. E.g., if this structure contains three diamonds
    ("00__ART", "01__NUM" and "02__TEMP"), the return value will be 2. The
    return value is -1, if the feature structure doesn't contain any
    ``Diamond``s.

    The index is read from the numeric prefix of the diamond keys, so the
    result is also correct if the indices have gaps (e.g. "00__ART" and
    "02__TEMP" yield 2). The result is never lower than the number of
    diamonds minus one, which covers diamonds stored under keys without a
    numeric prefix.

    :type featstruct: ``Diamond`` or ``Sentence``
    :rtype: ``int``
    """
    diamond_keys = [k for (k, v) in featstruct.items()
                    if isinstance(v, Diamond)]

    highest_index = len(diamond_keys) - 1
    for key in diamond_keys:
        try:
            match = re.match(r"(\d+)__", key)
        except TypeError:  # key is not a string (e.g. a Feature)
            continue
        if match:
            highest_index = max(highest_index, int(match.group(1)))
    return highest_index


def featstruct2avm(featstruct, mode="non-recursive"):
    """
    Renders an NLTK feature structure as an attribute-value matrix for
    LaTeX's avm environment.

    Nested ``Diamond``s are labelled with their mode, other nested feature
    structures with their key. The "mode" and "expected_parses" features
    are omitted; "root_nom" and "root_prop" are shown as "nom" and "prop".

    :type featstruct: ``nltk.featstruct`` or ``Diamond`` or ``Sentence``

    :type mode: ``str``
    :param mode: "non-recursive" for the outermost call: the result is
    wrapped in ``\\begin{avm}...\\end{avm}`` and "*" and "_" are escaped.
    Any other value (the function uses "recursive" for nested structures)
    returns the bare matrix.

    :rtype: ``str``
    """
    rows = []
    for key, val in sorted(featstruct.items()):

        if isinstance(val, Diamond):
            # nested Diamond: the row is labelled with the Diamond's mode
            label = val[Feature("mode")]
            cell = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(ensure_utf8(label),
                                                  ensure_utf8(cell)))

        elif isinstance(val, nltk.FeatStruct):
            # every other subclass of FeatStruct incl. FeatStruct itself
            cell = featstruct2avm(val, mode="recursive")
            rows.append("{0} & {1} \\\\\n".format(ensure_utf8(key),
                                                  ensure_utf8(cell)))

        elif key not in (Feature("mode"), Feature("expected_parses")):
            # normal key, value pair ("mode" and "expected_parses" are
            # not printed)
            if key == Feature("root_nom"):
                key = Feature("nom")
            elif key == Feature("root_prop"):
                key = Feature("prop")
            rows.append("{0} & `{1}' \\\\\n".format(ensure_utf8(key),
                                                    ensure_utf8(val)))

    ret_str = "\[ " + "".join(rows) + " \]\n"

    if mode != "non-recursive":
        return ret_str

    escaped = ret_str.replace("*", "$*$").replace("_", "\_")
    return "{0}\n{1}{2}".format('\\begin{avm}', escaped, '\\end{avm}')


def etreeprint(element, debug=True, raw=False):
    """pretty print function for etree trees or elements

    :type element: ``etree._ElementTree`` or ``etree._Element``

    :param debug: if true: not only return the XML string, but also print
    it to stdout. if false: only return the XML string

    :param raw: if true: just transform the etree (element) into a string,
    don't add or prettify anything. if false: add an XML declaration and
    use pretty print to make the output more readable for humans.

    :return: the UTF-8 encoded XML serialization of ``element``
    """
    # Both flags are evaluated by truthiness. The former identity checks
    # (``raw is False``, ``debug is True``) mishandled values like 0 or 1.
    prettify = not raw
    xml_string = etree.tostring(element, xml_declaration=prettify,
                                pretty_print=prettify, encoding="UTF-8")

    if debug:
        # a single parenthesised argument prints the same way under
        # Python 2 and Python 3
        print(xml_string)
    return xml_string



# LaTeX preamble that main() writes in front of the generated AVMs when the
# output format is 'latex'. It loads the ``avm`` package and opens the
# document body; main() writes the matching ``\end{document}`` itself.
LATEX_AVM_HEADER = r"""
\documentclass[10pt]{article}
\usepackage{graphicx}
\usepackage{avm}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[ngerman]{babel}
\begin{document}

"""

def main():
    """
    parse command line args and do the conversions

    Reads each given HLDS XML file and writes all of its sentences to the
    output file (default: stdout), either as printed ``nltk`` feature
    structures or as a LaTeX document containing one AVM per sentence.
    """
    parser = argparse.ArgumentParser(
        description='convert HLDS XML to nltk.FeatStructs or LaTeX AVMs')
    parser.add_argument('files', metavar='FILE', type=str, nargs='+',
                        help='one or more HLDS XML files to be processed')
    # ``choices`` rejects unknown formats, which used to produce no output
    # at all
    parser.add_argument('-f', '--format', dest='output_format',
                        action='store', default='nltk',
                        choices=('nltk', 'latex'),
                        help='choose between nltk or latex')
    # ``const`` makes a bare ``-o`` fall back to stdout instead of None
    parser.add_argument('-o', '--outfile', nargs='?',
                        type=argparse.FileType('w'), default=sys.stdout,
                        const=sys.stdout,
                        help='file to write the output to (default: stdout)')

    args = parser.parse_args(sys.argv[1:])
    outfile = args.outfile
    try:
        if args.output_format == 'latex':
            outfile.write(LATEX_AVM_HEADER)
            for filename in args.files:
                hlds_reader = HLDSReader(filename, input_format="file")
                for sentence in hlds_reader.sentences:
                    outfile.write(featstruct2avm(sentence))
                    outfile.write("\n\n")
            outfile.write("\\end{document}")

        elif args.output_format == 'nltk':
            for filename in args.files:
                hlds_reader = HLDSReader(filename, input_format="file")
                for sentence in hlds_reader.sentences:
                    # same text the former ``print >>outfile, sentence,
                    # "\n\n"`` statement produced
                    outfile.write("{0} \n\n\n".format(sentence))
    finally:
        # never close the interpreter's stdout, only real output files
        if outfile is not sys.stdout:
            outfile.close()


# Run the command line converter only if this file is executed as a script
# (not when it is imported as a module).
if __name__ == "__main__":
    main()