Source code for pypolibox.textplan

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <arne-neumann@web.de>

# TODO: change Sequence to Joint (just change the name). Sequence is a
# multinuclear relation which combines predecessors and successors. Joint
# is a random relation without any restrictions.

"""
The ``textplan`` module is based on Nicholas FitzGerald's ``py_docplanner``[1], 
in particular on his idea to represent RST trees as attribute value matrices 
by using the ``nltk.featstruct`` data structure.

``textplan`` converts ``Proposition`` instances into ``Message``s (using 
attribute value notation). Via a set of ``Rule``s, these messages are combined 
into ``ConstituentSet``s. Rules are applied bottom-up, via a recursive 
best-first search (cf. ``__bottom_up_search``).

Not only messages, but also constituent sets can be combined 
via rules. If all messages present can be combined into one large 
``ConstituentSet``, this constituent set is called a ``TextPlan``. A 
``TextPlan`` represents a complete text plan in form of an attribute value 
matrix.

[1] Fitzgerald, Nicholas (2009). Open-Source Implementation of Document 
Structuring Algorithm for NLTK. 
"""


import nltk
from nltk.featstruct import Feature, FeatDict
from lxml import etree
from time import time

from util import (flatten, freeze_all_messages, msgs_instance_to_list_of_msgs,
                  ensure_unicode)
from rules import Rules, ConstituentSet
from messages import Message, Messages
from hlds import etreeprint # TODO: dbg, rm


class TextPlan(FeatDict):
    """
    The result of document planning, stored as an attribute value matrix.

    A ``TextPlan`` holds three entries:

    - a ``type`` feature (displayed as a prefix) naming the document type,
    - a ``title`` feature structure with the keys ``text`` and
      ``book score``,
    - ``children``, the ``ConstituentSet`` (or single message) that forms
      the actual plan.

    TODO: append __str__ method: should describe verbally if a TP is
    describing one book or comparing two books

    :param book_score: score of the book, stored under
        ``title['book score']``
    :param dtype: document type, stored under the ``type`` feature
    :type dtype: ``str``
    :param text: optional title text, stored under ``title['text']``
    :param children: the root of the plan tree
    """
    def __init__(self, book_score=None, dtype='TextPlan', text=None,
                 children=None):
        type_feature = Feature('type', display='prefix')
        title = FeatDict({'text': text, 'book score': book_score})

        self[type_feature] = dtype
        self['title'] = title
        self['children'] = children
class TextPlans(object):
    """
    Generates all ``TextPlan``s for an ``AllMessages`` instance, i.e. one
    document plan for each book that is returned as a result of the user's
    database query.

    The plans are collected in ``self.document_plans``, in the same order
    as ``allmessages.books``. An entry is ``None`` if ``generate_textplan``
    could not build a plan for that book.
    """
    def __init__(self, allmessages, debug=False):
        """
        :param allmessages: object whose ``books`` attribute is a sequence
            of books; each book provides ``messages`` and ``book_score``
        :type allmessages: ``AllMessages``
        :param debug: if true, print timing information and each generated
            plan to stdout
        :type debug: ``bool``
        """
        # generate all ``Rule``s that the ``Message``s will be checked
        # against (once, since they are the same for every book)
        rules = Rules().rules
        self.document_plans = []

        for index, book in enumerate(allmessages.books):
            before = time()
            messages = book.messages.values()  # all messages about one book
            plan = generate_textplan(messages, rules, book.book_score)
            time_diff = time() - before
            self.document_plans.append(plan)

            if not debug:
                continue

            print("Plan {0}: generated in {1} seconds.\n".format(index,
                                                                 time_diff))
            book_title = book.messages['id']['title']
            if index > 0:
                # every book but the first is reported alongside its
                # predecessor in ``allmessages.books``
                lastbook = allmessages.books[index - 1]
                lastbook_title = lastbook.messages['id']['title']
                print("Comparing '{0}' "
                      "with '{1}':\n\n{2}".format(book_title, lastbook_title,
                                                  plan))
            else:
                print("Describing '{0}':\n\n{1}".format(book_title, plan))
def generate_textplan(messages, rules=Rules().rules, book_score=None,
                      dtype='TextPlan', text=''):
    """
    The main method implementing the Bottom-Up document structuring
    algorithm from "Building Natural Language Generation Systems"
    figure 4.17, p. 108.

    The method takes a list of ``Message``s and a set of ``Rule``s and
    creates a document plan by repeatedly applying the highest-scoring
    Rule-application (according to the Rule's heuristic score) until a full
    tree is created. This is returned as a ``TextPlan`` with the tree set as
    ``children``.

    If no plan is reached using bottom-up, ``None`` is returned.

    :param messages: the messages which have been selected during content
        selection for inclusion in the TextPlan
    :type messages: ``list`` of ``Message``s or a ``Messages`` instance

    :param rules: a list of ``Rule``s specifying relationships which can
        hold between the messages
    :type rules: list of ``Rule``s

    :param book_score: score stored in the title of the resulting plan.
        If ``messages`` is a ``Messages`` instance, its ``book_score``
        attribute is used instead.

    :param dtype: an optional type for the document
    :type dtype: string

    :param text: an optional text string describing the document
    :type text: string

    :return: a document plan. if no plan could be created: return None
    :rtype: ``TextPlan`` or ``NoneType``

    :raises TypeError: if ``messages`` is neither a ``list`` nor a
        ``Messages`` instance
    """
    if isinstance(messages, list):
        frozen_messages = freeze_all_messages(messages)
    elif isinstance(messages, Messages):
        book_score = messages.book_score
        message_list = msgs_instance_to_list_of_msgs(messages)
        frozen_messages = freeze_all_messages(message_list)
    else:
        # without this branch ``frozen_messages`` would be unbound below
        raise TypeError("messages must be a list of Messages or a Messages "
                        "instance, got {0}".format(type(messages).__name__))

    messages_set = set(frozen_messages)  # remove duplicate messages
    ret = __bottom_up_search(messages_set, rules)
    if not ret:
        return None

    # a valid plan is a set with exactly one element: the root of the tree
    children = ret.pop()
    return TextPlan(book_score=book_score, dtype=dtype, text=text,
                    children=children)
def __bottom_up_search(messages, rules):
    """
    generate_textplan() helper which performs a recursive best-first search.

    :param messages: a set containing ``Message``s and/or ``ConstituentSet``s
    :type messages: ``set`` of ``Message``s or ``ConstituentSet``s

    :param rules: a list of ``Rule``s specifying relationships which can
        hold between the messages
    :type rules: ``list`` of ``Rule``s

    :return: a set containing one element, i.e. the first valid plan reached
        by best-first-search. returns None if no valid plan is found.
    :rtype: ``NoneType`` or a ``set`` of (``Message``s or
        ``ConstituentSet``s)

    :raises Exception: if ``messages`` is empty or if a rule fails while
        computing its options
    """
    if len(messages) == 1:
        return messages
    if len(messages) < 1:
        raise Exception('Error: Input contains no messages.')

    # Query the rules one at a time, so that a failure is attributed to the
    # rule that actually caused it.
    options = []
    for rule in rules:
        try:
            rule_options = rule.get_options(messages)
        except Exception:
            raise Exception('ERROR: Rule {0} had trouble with these '
                            'messages: {1}'.format(rule, messages))
        options.append(rule_options)

    options_list = []
    for score, rst_relation, removes in flatten(options):
        # freeze the relation before it is put into a set further down
        rst_relation.freeze()
        options_list.append((score, rst_relation, removes))

    if not options_list:
        return None

    # sort all options by their score, beginning with the highest one
    sorted_options = sorted(options_list, key=lambda option: option[0],
                            reverse=True)

    for score, rst_relation, removes in sorted_options:
        # rst_relation: a ConstituentSet (RST relation) that was generated
        #     by Rule.get_options()
        # removes: a list containing those messages that are now part of
        #     'rst_relation' and should therefore not be used again
        test_set = messages - set(removes)
        # a set containing a ConstituentSet and one or more Messages that
        # haven't been integrated into a structure yet
        test_set = test_set.union(set([rst_relation]))

        ret = __bottom_up_search(test_set, rules)
        if ret:
            return ret
    return None
def linearize_textplan(textplan):
    """
    Flattens a text plan (an RST tree represented as a NLTK.featstruct data
    structure) into the list of ``Message``s it contains, in the order in
    which ``textplan.walk()`` yields them. The list is meant as input for
    surface generation.

    Only elements whose type is exactly ``Message`` are kept.

    :type textplan: ``TextPlan``
    :rtype: ``list`` of ``Message``s
    """
    ordered_messages = []
    for node in textplan.walk():
        if type(node) is Message:
            ordered_messages.append(node)
    return ordered_messages
def textplans2xml(textplans):
    """
    Builds one XML document that represents every text plan stored in
    ``textplans.document_plans``. Each plan is appended as a child of the
    common ``<xml>`` root element.

    :type textplans: a ``TextPlans`` instance
    :rtype: ``etree._ElementTree``
    """
    xml_root = etree.Element("xml")
    for plan in textplans.document_plans:
        # the helper attaches the converted plan to xml_root itself
        __textplan_header2xml(xml_root, plan)
    return etree.ElementTree(xml_root)
def textplan2xml(textplan):
    """
    Builds an XML document that represents a single text plan. The plan is
    appended as the only child of an ``<xml>`` root element.

    :type textplan: ``TextPlan``
    :rtype: ``etree._ElementTree``
    """
    xml_root = etree.Element("xml")
    # the helper attaches the converted plan to xml_root itself
    __textplan_header2xml(xml_root, textplan)
    return etree.ElementTree(xml_root)
def __textplan_header2xml(tree_root, textplan):
    """
    helper function for textplan2xml() and textplans2xml().
    extracts meta data from the text plan (book score etc.), calls
    __textplantree2xml to convert the actual text plan to XML and inserts
    both into the tree_root XML structure.

    :type tree_root: ``etree._Element``
    :param tree_root: the root element of the resulting text plan XML
        structure
    :type textplan: ``TextPlan``

    :rtype: ``etree._Element``
    :return: one <textplan></textplan> XML structure
    """
    xml_textplan = etree.SubElement(tree_root, "textplan")

    book_score = str(textplan["title"]["book score"])
    document_type = textplan[Feature("type")]
    target_string = textplan["title"]["text"]

    header = etree.SubElement(xml_textplan, "header", score=book_score,
                              type=document_type)
    target = etree.SubElement(header, "target")
    target.text = target_string

    # the RST tree follows the <header> element
    rst_tree = __textplantree2xml(textplan["children"])
    xml_textplan.insert(1, rst_tree)
    return xml_textplan


def __textplantree2xml(tree):
    """
    helper function for __textplan_header2xml() which converts the actual
    text plan into XML. ``ConstituentSet``s are converted recursively into
    <relation> elements with a <nucleus> and a <satellite> child.

    :type tree: ``ConstituentSet`` or ``Message``
    :rtype: ``etree._Element``
    :raises TypeError: if ``tree`` is neither a ``ConstituentSet`` nor a
        ``Message``
    """
    if isinstance(tree, ConstituentSet):
        relation_type = tree[Feature("relType")]
        nucleus_tree = __textplantree2xml(tree[Feature("nucleus")])
        satellite_tree = __textplantree2xml(tree[Feature("satellite")])

        relation = etree.Element("relation", type=relation_type)
        nucleus = etree.SubElement(relation, "nucleus")
        nucleus.insert(0, nucleus_tree)
        satellite = etree.SubElement(relation, "satellite")
        satellite.insert(0, satellite_tree)
        return relation

    if isinstance(tree, Message):
        return __message2xml(tree)

    # Fail here with a clear message instead of returning None, which the
    # caller would try to insert into the XML tree.
    raise TypeError("cannot convert {0} to XML: expected a ConstituentSet "
                    "or a Message".format(type(tree).__name__))


def __message2xml(message):
    """
    converts a single ``Message`` into an XML structure.

    The ``msgType`` feature becomes the ``type`` attribute of the <message>
    element. All other entries are converted depending on their key:
    entries with ``str`` keys are converted if their value is a ``FeatDict``
    or a ``tuple`` (other values are skipped), all remaining entries are
    treated as ``Feature``-keyed.

    :type message: ``Message``
    :rtype: ``etree._Element``
    """
    msg_type = message[Feature("msgType")]
    msg = etree.Element("message", type=msg_type)

    for key, val in message.items():
        if key == Feature("msgType"):
            continue  # already stored as the element's type attribute

        if isinstance(key, str):
            if isinstance(val, FeatDict):  # relative length or recency
                __message_strkey_featdictval2xml(msg, key, val)
            elif isinstance(val, tuple):  # (value, rating) tuple
                __message_strkey_tupleval2xml(msg, key, val)
        else:  # presumably a ``Feature`` key
            __message_featurekey2xml(msg, key, val)
    return msg


def __message_strkey_featdictval2xml(msg, key, val):
    """
    helper function for __message2xml(). converts book length and
    recency information to XML and appends it to ``msg``.

    :param msg: the <message> element the new element is appended to
    :type msg: ``etree._Element``
    :param key: name of the new element
    :type key: ``str``
    :param val: feature structure with a ``rating`` and either ``type``,
        ``direction`` and ``magnitude`` (``number`` and ``unit``) or a
        ``description``
    :type val: ``FeatDict``
    """
    rating = val["rating"]
    if "type" in val:  # length or recency (RelativeVariation)
        val_type = val["type"]
        direction = val["direction"]
        number = str(val["magnitude"]["number"])
        unit = val["magnitude"]["unit"]

        msgkey = etree.SubElement(msg, key, rating=rating, type=val_type)
        etree.SubElement(msgkey, "magnitude", number=number, unit=unit,
                         direction=direction)
    else:  # length or recency (extra)
        description = val["description"]
        etree.SubElement(msg, key, description=description, rating=rating,
                         type="extra")


def __message_strkey_tupleval2xml(msg, key, val):
    """
    helper function for __message2xml(). converts those parts of a message
    to XML whose key is a string (i.e. most of them) and whose value is a
    (value, rating) tuple.

    :param msg: the <message> element the new element is appended to
    :type msg: ``etree._Element``
    :param key: name of the new element
    :type key: ``str``
    :param val: a (value, rating) pair
    :type val: ``tuple``
    """
    value, rating = val
    if isinstance(value, frozenset):
        # authors, keywords, proglangs etc.: one <value> child per element
        msgkey = etree.SubElement(msg, key, rating=rating)
        for element in value:
            msgval = etree.SubElement(msgkey, "value")
            msgval.text = ensure_unicode(element)
    else:
        # title, language, year, pages etc.: stored as an attribute
        etree.SubElement(msg, key, value=ensure_unicode(value),
                         rating=rating)


def __message_featurekey2xml(xml_msg, msg_key, msg_val):
    """
    helper function for __message2xml(). converts those parts of a message
    to XML whose key is of type ``Feature``, i.e. *reference_authors* and
    *reference_title*. The resulting element is named after the feature and
    marked with ``feature="true"``.

    :param xml_msg: the <message> element the new element is appended to
    :type xml_msg: ``etree._Element``
    :type msg_key: ``Feature``
    :param msg_val: a (value, rating) pair
    :type msg_val: ``tuple`` of (``frozenset`` or ``str``) and rating
    """
    value, rating = msg_val
    if isinstance(value, frozenset):  # *reference_authors*
        featkey = etree.SubElement(xml_msg, msg_key.name, feature="true",
                                   rating=rating)
        for element in value:
            featval = etree.SubElement(featkey, "value")
            # same conversion as for the other text values in this module
            # (instead of str()), so that non-ASCII names are handled alike
            featval.text = ensure_unicode(element)
    else:  # *reference_title*
        etree.SubElement(xml_msg, msg_key.name, feature="true",
                         value=ensure_unicode(value), rating=rating)
def test_textplan2xml_conversion():
    """
    test text plan to XML conversion with all the text plans that were
    generated for all test queries with debug.gen_all_textplans().

    Reads the pickled text plans from ``data/alltextplans.pickle`` (relative
    to the current working directory) and prints the XML twice: first one
    document per query, then one document per text plan.
    """
    import cPickle

    # NOTE(review): unpickling executes arbitrary code -- only use this with
    # a pickle file you generated yourself.
    with open("data/alltextplans.pickle", "r") as pickle_file:
        atp = cPickle.load(pickle_file)

    print("""### Output: One XML file per query ###\n\n""")
    for atp_count, textplans in enumerate(atp):
        xml_plan = textplans2xml(textplans)
        print("All text plans generated"
              " for test query #{0}:\n".format(atp_count))
        print(etreeprint(xml_plan, debug=False))
        print("\n\n")

    print("""### Output: One XML file per text plan ###\n\n""")
    for atp_count, textplans in enumerate(atp):
        for tp_count, docplan in enumerate(textplans.document_plans):
            xml_plan = textplan2xml(docplan)
            # {0} is the index of the plan, {1} the index of the query
            print("Text plan {0} for "
                  "test query {1}:\n".format(tp_count, atp_count))
            print(etreeprint(xml_plan, debug=False))
            print("\n\n")
# When executed as a script (not imported), run the text plan to XML
# conversion test defined in this module.
if __name__ == "__main__": test_textplan2xml_conversion()