# Source code for pypolibox.facts

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <arne-neumann@web.de>

"""
The ``facts`` module takes the information stored in ``Book`` instances and 
converts them into attribute value matrices (``Facts``). Furthermore, the 
module compares each book with its predecessor (e.g. book A is newer than book 
B and has code examples, while B is shorter and targets beginners ...). The 
insights gathered from these comparisons are also stored in ``Facts`` 
instances.
"""

import datetime

class AllFacts():
    """
    Simply speaking, an ``AllFacts`` instance contains all facts about all 
    books that were returned by a database query. More formally, it contains a 
    ``Facts`` instance for each ``Book`` in a ``Books`` instance.
    
    In a ``Books`` instance, all books returned by a database query are sorted 
    by the number of query parameters they match ('user model match') in 
    descending order. This means that ``AllFacts`` will contain facts about 
    the best-matching book, followed by facts about the second-best matching 
    book (including a comparison to the best matching one), followed by facts 
    about the third-best matching book (including a comparison to the second 
    one) etc.

    Attributes set by ``__init__``:

    - ``query_args``: the query arguments copied from the ``Books`` instance
    - ``book_scores``: the scores copied from the ``Books`` instance
    - ``books``: a list with one ``Facts`` instance per book, in the same 
      order as ``b.books``
    """
    def __init__(self, b):
        """
        generates all facts for all books returned by a database query, i.e. a 
        ``Facts`` instance for each ``Book`` in a ``Books`` instance. For a 
        hands-on description, see the ``Facts`` documentation.

        Parameters
        ----------
        b : Books
            a ``Books`` instance, which contains all ``Book`` instances 
            that were constructed from the database query results. This 
            method reads its ``query_args``, ``scores`` and ``books`` 
            attributes; ``scores`` is indexed in parallel with ``books``.
        """
        self.query_args = b.query_args # original query args for generating query_facts
        self.books = []
        self.book_scores = b.scores
        
        for index, book in enumerate(b.books):
            book_score = self.book_scores[index]
            if index == 0:
                # the first book has no predecessor to be compared with
                book_facts = Facts(book, book_score, index)
            else:
                # every other book is compared with the book preceding it
                preceding_book = b.books[index-1]
                book_facts = Facts(book, book_score, index, preceding_book)
            self.books.append(book_facts)
                
    def __str__(self):
        """
        returns the facts of each book as one string. Each book is introduced 
        by a header line containing its (zero-based) index.
        """
        template = "facts about book #{0}:\n" \
                   "--------------------\n" \
                   "{1}\n\n"
        return "".join(template.format(index, book) 
                       for index, book in enumerate(self.books))

class Facts():
    """
    A ``Facts`` instance represents facts about a single book, but also 
    contains a comparison of that particular book with its predecessor. 
    """
    def __init__(self, book, book_score, index=0, preceding_book=False):
        """
        Uses the facts/metadata retrieved from the sqlite3 database, and 
        generates facts in form of an attribute value matrix. The facts are 
        grouped logically. A ``Facts`` instance basically consists of a 
        dictionary (stored in ``self.facts``) containing these four keys::
        
            (1) 'id_facts'
            (2) 'extra_facts'
            (3) 'query_facts'
            (4) 'lastbook_facts'

        Since this method is basically dealing with a list of ``Book`` 
        instances, the first book's ``Facts`` instance will not contain 
        'lastbook_facts', as there is no previous book in the list that it 
        could be compared to.

        :param book: a ``Book`` instance
        :type book: ``Book``
        
        :param book_score: the score of the book that was calculated in 
        :class:`Books.get_book_ranks()`
        :type book_score: ``float``
        
        :param index: the index of the book in the ``Books`` list of books
        :type index: ``int``
        
        :param preceding_book: the ``Book`` instance preceding this one in 
        the list of books. If given, both books will be compared. ``False`` 
        (the default) or ``None`` means that there is no preceding book.
        :type preceding_book: ``Book`` or ``False``
        """
        facts = {}
                
        facts["id_facts"] = self.generate_id_facts(index, book)
        facts["extra_facts"] = self.generate_extra_facts(index, book)
        facts["query_facts"] = self.generate_query_facts(index, book, book_score)
                
        # Identity checks (instead of ``== False``) make sure that we never 
        # call a custom ``Book.__eq__`` just to detect the "no predecessor" 
        # sentinel. The first/only book is not compared to anything.
        if preceding_book is not False and preceding_book is not None:
            # generate additional facts, comparing the current with the 
            # preceding book
            facts["lastbook_facts"] = self.generate_lastbook_facts(index, book, preceding_book)
        self.facts = facts

    def generate_id_facts(self, index, book):
        """
        generates a dictionary of id facts about the current book which will be 
        stored in ``self.facts["id_facts"]``. In contrast to other facts, 
        ``id_facts`` are those kind of facts that can be directly retrieved 
        from the database (i.e. there is no comparison between books or 
        reasoning involved). The id_facts dictionary contains the following 
        keys::
        
            id_facts keys       database book table columns
            
            'authors'
            'codeexamples'      'examples'
            'exercises'
            'keywords'
            'language'          'lang'
            'pages'
            'proglang'          'plang'
            'target'
            'title'
            'year'

        The key names should be self-explanatory. In those cases where they do 
        not exactly match their counterparts in the database, the 
        corresponding database table column name is given in the table above.

        :param index: the index of the book in the ``Books`` list of books 
        (not used by this method)
        :type index: ``int``

        :param book: a ``Book`` instance
        :type book: ``Book``

        :return: a dictionary with the keys described above
        :rtype: ``dict``
        """
        id_facts = {}
        attributes = ['authors', 'codeexamples', 'exercises', 'keywords', 
                      'language', 'pages', 'proglang', 'target', 'title', 
                      'year']
        
        # Instead of writing lots of repetitive code like in JPolibox:
        #    id_facts["authors"] = book.authors
        #    id_facts["codeexamples"] = book.codeexamples ...
        # we copy all those book attributes in a loop (with ``getattr``).
        for attribute in attributes:
            id_facts[attribute] = getattr(book, attribute)
                
        return id_facts
        
    def generate_query_facts(self, index, book, book_score):
        """
        generates facts that describe if a book matches (parts of) the query 
        (a.k.a the user model). a typical query_facts dictionary will look 
        like this::
        
            query_facts:
                usermodel_nomatch: {'codeexamples': 0}
                usermodel_match: {'exercises': 1, 'keywords': 
                                 set(['semantics', 'parsing']), 'language': 
                                 'German'} 
                book_score: 0.8

        The book described in this example matches 80 % of the user 
        requirements (it contains exercises and deals with semantics and 
        parsing and is written in German) but does not contain code examples 
        (as was asked for by the user).

        Only query parameters that have a non-empty (truthy) value are taken 
        into account. The query parameters are read from 
        ``book.query_args``.
        
        :param index: the index of the book in the ``Books`` list of books 
        (not used by this method)
        :type index: ``int``

        :param book: a ``Book`` instance
        :type book: ``Book``

        :param book_score: the score of the book that was calculated in 
        :class:`Books.get_book_ranks()`
        :type book_score: ``float``
        
        :return: a dictionary that contains three keys, the ``book_score``, 
        the ``usermodel_match`` as well as the ``usermodel_nomatch``. 
        'usermodel_match' contains all the features that were requested by 
        the user and are present in the book. 'usermodel_nomatch' contains 
        all features that were requested but are missing from the book. For 
        simple attributes, the stored value is always the value found in the 
        book; for complex attributes, it is the set of requested values that 
        were (not) found in the book.
        :rtype: ``dict``
        """
        usermodel_match = {}
        usermodel_nomatch = {}
        query_facts = {}
        query_facts["book_score"] = book_score
        query_facts["usermodel_match"] = usermodel_match
        query_facts["usermodel_nomatch"] = usermodel_nomatch
        query_args = book.query_args
        simple_attributes = ['codeexamples', 'exercises', 'language', 
                             'pagerange', 'target']
        complex_attributes = ['keywords', 'proglang'] 
        # complex attributes may contain more than 1 value
        
        for simple_attribute in simple_attributes:
            requested_value = getattr(query_args, simple_attribute)
            # if query_args has a non-empty value for this attribute
            if requested_value:
                book_value = getattr(book, simple_attribute)
                if requested_value == book_value:
                    usermodel_match[simple_attribute] = book_value
                else:
                    usermodel_nomatch[simple_attribute] = book_value
                    
        for complex_attribute in complex_attributes:
            requested_values = getattr(query_args, complex_attribute)
            # if query_args has at least one value for this attribute
            if requested_values:
                book_values = getattr(book, complex_attribute)
                matching_values = set()
                nonmatching_values = set()
                for value in requested_values:
                    if value in book_values:
                        matching_values.add(value)
                    else:
                        nonmatching_values.add(value)
                # empty sets are not stored
                if matching_values:
                    usermodel_match[complex_attribute] = matching_values
                if nonmatching_values:
                    usermodel_nomatch[complex_attribute] = nonmatching_values

        return query_facts
                
    def generate_lastbook_facts(self, index, book, preceding_book):
        """
        generates facts that compare the current book with the preceding one. 
        A typical example of a lastbook_facts dictionary would look like 
        this::
        
            lastbook_facts:
                lastbook_nomatch: 
                    {'language': 'German', 
                    'keywords_preceding_book_only': 
                        set(['pragmatics', 'chart parsing']), 
                    'keywords_current_book_only':
                        set([' ', 'grammar', 'language hierarchy', 'corpora', 
                            'syntax', 'morphology', 'left associative 
                            grammar']), 
                    'codeexamples': 0, 
                    'proglang': set(['Lisp']), 
                    'newer': 11, 
                    'keywords': 
                        set([' ', 'grammar', 'language hierarchy', 'corpora', 
                        'syntax', 'left associative grammar', 'morphology', 
                        'chart parsing', 'pragmatics']),
                    'proglang_preceding_book_only': 
                        set(['Lisp'])} 
                lastbook_match: 
                    {'exercises': 1, 'keywords': set(['semantics', 
                    'parsing']), 'target': 0, 'pagerange': 1}

        This method will calculate if the current book is 
        newer/older/shorter/longer than its predecessor (if so, it will store 
        the difference as an integer). For keys that have sets as their 
        values (``keywords`` and ``proglang``), the resulting dictionary will 
        list which values differed and which were only present in either the 
        preceding or the current book.
        
        :param index: the index of the book in the ``Books`` list of books 
        (not used by this method)
        :type index: ``int``

        :param book: a ``Book`` instance
        :type book: ``Book``
                
        :param preceding_book: the ``Book`` instance that precedes ``book`` 
        in the list of books and that ``book`` will be compared with
        :type preceding_book: ``Book``
        
        :return: a dictionary with two keys: ``lastbook_match`` and 
        ``lastbook_nomatch``, which in turn are dictionaries themselves and 
        contain facts that are shared between the two books (lastbook_match) 
        or that differ between the two (lastbook_nomatch). 
        :rtype: ``dict``
        """
        lastbook_match = {}
        lastbook_nomatch = {}
        lastbook_facts = {}
        lastbook_facts['lastbook_match'] = lastbook_match
        lastbook_facts['lastbook_nomatch'] = lastbook_nomatch
        simple_comparisons = ['codeexamples', 'exercises', 'language', 'target']
        set_comparisons = ['keywords', 'proglang']
        
        # in both cases, the value of the *current* book is stored
        for simple_comparison in simple_comparisons:
            current_value = getattr(book, simple_comparison)
            if current_value == getattr(preceding_book, simple_comparison):
                lastbook_match[simple_comparison] = current_value
            else:
                lastbook_nomatch[simple_comparison] = current_value
                
        for attribute in set_comparisons:
            current_attrib = getattr(book, attribute)
            preceding_attrib = getattr(preceding_book, attribute)
            if not current_attrib and not preceding_attrib:
                continue # both sets are empty, nothing to compare

            shared_values = current_attrib.intersection(preceding_attrib)
            if shared_values:
                lastbook_match[attribute] = shared_values
            
            # NOTE: in contrast to the other set-valued facts, this one is 
            # stored even if it is empty (i.e. if both books have the same 
            # non-empty set of values).
            non_shared_values = current_attrib.symmetric_difference(preceding_attrib)
            lastbook_nomatch[attribute] = non_shared_values
            
            current_only_values = current_attrib.difference(preceding_attrib)
            if current_only_values:
                lastbook_nomatch[attribute + '_current_book_only'] = current_only_values

            preceding_only_values = preceding_attrib.difference(current_attrib)
            if preceding_only_values:
                lastbook_nomatch[attribute + '_preceding_book_only'] = preceding_only_values
 
        if book.year == preceding_book.year:
            lastbook_match["year"] = book.year
        elif book.year > preceding_book.year:
            lastbook_nomatch["newer"] = book.year - preceding_book.year
        else:
            lastbook_nomatch["older"] = preceding_book.year - book.year

        # Books count as equally long if they fall into the same page range. 
        # The exact page difference is only calculated if the ranges differ.
        if book.pagerange == preceding_book.pagerange:
            lastbook_match["pagerange"] = book.pagerange
        elif book.pages > preceding_book.pages:
            lastbook_nomatch["longer"] = book.pages - preceding_book.pages
        else: # current book is shorter
            lastbook_nomatch["shorter"] = preceding_book.pages - book.pages
                
        return lastbook_facts
    
    def generate_extra_facts(self, index, book):
        """
        generates ``extra_facts``, if the current book is very new/old or very 
        short/long. A book is considered "very short" if it has less than 100 
        pages and "very long" if it has more than 600 pages. It is "recent" 
        if it was published less than 10 years ago and "old" if it was 
        published more than 30 years ago (measured against the current year).

        :param index: the index of the book in the ``Books`` list of books 
        (not used by this method)
        :type index: ``int``

        :param book: a ``Book`` instance
        :type book: ``Book``
        
        :return: a dictionary that contains information about the recency 
        (key ``year``) and length (key ``pages``) of a book. A key is only 
        present if the book is remarkable in that regard.
        :rtype: ``dict``
        """
        current_year = datetime.datetime.today().year
        extra_facts = {}
        if book.pages < 100:
            extra_facts["pages"] = "very short"
        elif book.pages > 600:
            extra_facts["pages"] = "very long"
        if (current_year - 10) < book.year: # newer than 10 years
            extra_facts["year"] = "recent"
        elif (current_year - 30) > book.year: # older than 30 years
            extra_facts["year"] = "old"
        
        return extra_facts

    def __str__(self):
        """returns the ``Facts`` instance as a string, but omits empty values"""
        # lists, dicts and sets can be empty. We can't simply say "if val:", 
        # since this would not only exclude empty lists/dicts/sets but 
        # also "0".
        signifiers_of_emptyness = [ [], {}, set() ]
        return_string = ""
        # ``items()`` (in contrast to ``iteritems()``) is available in 
        # Python 2 and Python 3
        for key, value in self.facts.items():
            if value not in signifiers_of_emptyness:
                return_string += "\n{0}:\n".format(key)
                for attribute, val in value.items():
                    if val not in signifiers_of_emptyness:
                        return_string += "\t{0}: {1}\n".format(attribute, val)
        return return_string