# Source code for pypolibox.realization

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <arne-neumann@web.de>

"""
The ``realization`` module shall take HLDS XML structures, realize them with
the OpenCCG surface realizer and parse its output string.
"""

import os
import re
import pexpect
import time
from tempfile import NamedTemporaryFile
from commands import getstatusoutput
from copy import deepcopy

from hlds import (Diamond, Sentence, diamond2sentence, add_nom_prefixes,
                  create_hlds_file)


# Location of the grammar directory: a path relative to the current working
# directory when this file is executed as a script, otherwise the ``grammar``
# directory that sits next to this module file.
GRAMMAR_DIR = (
    'grammar' if __name__ == '__main__'
    else os.path.join(os.path.dirname(__file__), 'grammar')
)


class OpenCCG(object):
    """
    command-line interaction with OpenCCG's ``tccg`` parser/generator.

    An instance spawns ``tccg`` as an interactive child process (using
    ``pexpect``) and communicates with it through its ``tccg>`` prompt.
    """
    def __init__(self, grammar_dir=GRAMMAR_DIR, lang='de'):
        """
        spawns the OpenCCG/tccg server as a process

        Parameters
        ----------
        grammar_dir : str
            path to the directory that contains the grammar
        lang : str
            name of the sub-directory of ``grammar_dir`` in which ``tccg``
            is started (default: ``'de'``)
        """
        # Honour the ``grammar_dir`` argument (the module-level default is
        # only used when the caller does not pass one).
        grammar_path = os.path.join(grammar_dir, lang)
        tccg_binary = "tccg"

        # tccg is started from inside the grammar directory. Always switch
        # back to the previous working directory, even if spawning fails.
        current_dir = os.getcwd()
        os.chdir(grammar_path)
        try:
            self._server = pexpect.spawn(tccg_binary)
            print("starting tccg as a server with this path: %s" % tccg_binary)
        finally:
            os.chdir(current_dir)

        print("checking tccg settings ...")
        self._server.expect("\ntccg>")  # wait for the tccg input prompt
        print("Okay, here you go.")
        print("current settings:\n{0}".format(self.parse(":sh")))

    def parse(self, text, verbose=True, raw_output=True):
        """
        This is the core interaction with the parser: sends ``text`` to the
        running ``tccg`` process and collects everything it prints until
        the next ``tccg>`` prompt shows up (or the process ends).

        :param text: the line that is sent to ``tccg``
        :param verbose: if True, timeout information is printed
        :param raw_output: if True, the raw response string from the server
            will be returned. otherwise, a placeholder message is returned,
            since parsing tccg's output is not implemented here.
        :return: the collected output string. If a read times out after
            more than ``max_expected_time`` seconds have passed, a ``dict``
            with the keys ``error``, ``input`` and ``output`` is returned
            instead.
        :rtype: ``str`` OR ``dict``
        """
        # clean up anything leftover
        while True:
            try:
                # the second argument is a forced delay (in seconds)
                # EVERY parse must incur.
                # TODO make this as small as possible.
                self._server.read_nonblocking(4000, 3)  # originally: 0.3s
            except pexpect.TIMEOUT:
                break

        self._server.sendline(text)

        # How much time should we give the parser to parse it?
        max_expected_time = 20.0
        if verbose:
            print("Timeout %s" % max_expected_time)
        end_time = time.time() + max_expected_time

        incoming = ""
        while True:
            # Time left, read more data
            try:
                ch = self._server.read_nonblocking(2000, 2)
                time.sleep(0.0001)
                incoming = incoming + ch
                if "\ntccg>" in incoming:
                    break
            except pexpect.TIMEOUT:
                if verbose:
                    print("Timeout")
                # the deadline is only checked when a read times out
                if end_time - time.time() < 0:
                    return {'error': "timed out after %f seconds" % max_expected_time,
                            'input': text,
                            'output': incoming}
                else:
                    continue
            except pexpect.EOF:
                break

        if raw_output:
            # plain text results returned by ``tccg``
            return incoming
        # return parsed results
        return "I can't parse tccg's output, yet!"

    def realize(self, featstruct, raw_output=True):
        """
        converts a ``Diamond`` or ``Sentence`` feature structure into
        HLDS-XML, writes it to a temporary file, realizes this file with
        ``tccg`` and parses the output it returns.

        The raw ``tccg`` output is also stored in ``self.tccg_output``.

        NOTE: ``raw_output`` is accepted but not used by this method.
        NOTE: if the underlying ``parse()`` call times out, it returns an
        error ``dict`` instead of a string; that case is not handled here.

        :type featstruct: ``Diamond`` or ``Sentence``
        :return: what ``parse_tccg_generator_output`` returns for the
            collected ``tccg`` output
        """
        # work on a copy, so the caller's feature structure isn't modified
        temp_sentence = deepcopy(featstruct)

        if isinstance(featstruct, Diamond):
            temp_sentence = diamond2sentence(temp_sentence)

        add_nom_prefixes(temp_sentence)
        sentence_xml_str = create_hlds_file(temp_sentence,
                                            mode="realize", output="xml")

        # ``with`` closes the file even if writing fails
        with open("pypolibox-tccg.tmp", "w") as tmp_file:
            tmp_file.write(sentence_xml_str)
        tmp_file_path = os.path.abspath(tmp_file.name)

        self.tccg_output = self.realize_hlds(tmp_file_path)
        # NOTE: the temporary file is left on disk (its removal was
        # commented out in the original code).
        return parse_tccg_generator_output(self.tccg_output)

    def realize_hlds(self, hlds_xml_filename):
        """
        sends tccg's ``:r`` command with the given HLDS XML file name and
        returns whatever ``parse()`` returns for it.

        :param hlds_xml_filename: path to a HLDS XML file
        """
        tccg_command_string = ":r {0}".format(hlds_xml_filename)
        return self.parse(tccg_command_string, verbose=False)

    def terminate(self):
        """terminates the spawned ``tccg`` process."""
        self._server.terminate()
def parse_tccg_generator_output(tccg_output):
    """
    parses the output string returned from tccg's interactive generator
    shell.

    The first and the last line of ``tccg_output`` are ignored. Every
    remaining line must start with a score in square brackets, followed by
    the realized string and `` :-``, e.g. ``[1.000] some text :- s``.

    :param tccg_output: raw output string produced by ``tccg``
    :type tccg_output: ``str``
    :return: the realized strings, without duplicates, in sorted order
    :rtype: ``list`` of ``str``
    :raises ValueError: if one of the remaining lines doesn't match the
        expected format
    """
    # raw string, so the backslashes reach the regex engine untouched
    result_regex = re.compile(r"\[\d\.\d+\] (.*?) :-")

    # remove 1st + last line (presumably the echoed command and the
    # trailing prompt -- TODO confirm against real tccg output)
    output_lines = tccg_output.splitlines()[1:-1]

    results = set()
    for line in output_lines:
        match = result_regex.match(line)
        if match is None:
            raise ValueError(
                "Can't parse tccg output line:\n{0}".format(line))
        results.add(match.group(1))
    return sorted(results)