Source code for pypolibox.database

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <arne-neumann@web.de>

"""
The ``database`` module is responsible for parsing the user's requirements
(both from command line options, as well as interactively from the Python
interpreter), transforming these requirements into an SQL query, querying the
sqlite database and returning the results.
"""

import os
import argparse
import sqlite3
import util

if __name__ == '__main__':
    DB_FILE = 'data/books.sqlite'
else:
    DB_FILE = os.path.join(os.path.dirname(__file__),'data/books.sqlite')

BOOK_TABLE_NAME = 'books' # name of the table in the database file that
                          # contains info about books
DEFAULT_ENCODING = 'UTF8'

[docs]class Query: """ a ``Query`` instance represents one user query to the database Queries can be made from the command line, as well as from the Python interpreter. From the command line, queries can be made using either abbreviated or long parameters. The following examples both query the database for books that contain code examples and deal with both semantics and parsing:: python pypolibox.py -k semantics, parsing -c 1 python pypolibox.py --keywords semantics, parsing --codeexamples 1 When calling ``pypolibox.py`` from within the Python interpreter, the same query can be made using the following command:: Query(["-k", "semantics", "parsing", "-c", "1"]) If you print the ``Query`` instance (by using the ``print`` command), it will return the SQL query that was constructed from the user input:: SELECT * FROM books WHERE keywords like '%semantics%' AND keywords like '%parsing%' AND examples = 1 TODO: This module talks directly to the database. To make it easier to adapt pypolibox to a different domain, an SQL abstraction layer (e.g. SQL Alchemy) should be used. """ def __init__ (self, argv): """ given a list of query arguments, this constructor parses commandline options with argparse, constructs a valid sql query and stores the resulting query strings in self.and_query (using boolean AND to combine the query arguments) and self.or_query (boolean OR). TODO: add max_textplans paramter --> generate only the X highest ranking books :param argv: a list of strings (either parsed from the command line or set programmatically) :type argv: ``list`` of ``str`` """ self.queries = [] self.minresults = 3 query_and = " AND " query_or = " OR " parser = argparse.ArgumentParser() parser.add_argument("-k", "--keywords", nargs='+', help="Which topic(s) should the book cover?") #nargs='+' handles 1 or more args parser.add_argument("-l", "--language", help="Which language should the book have?") parser.add_argument("-p", "--proglang", nargs='+', help="Which programming language(s) should the book use?") parser.add_argument("-s", "--pagerange", type=int, help="book length ranges. 0 = less than 300 pages, " \ "1 = between 300 and 600 pages. 2 = more than 600 pages.") parser.add_argument("-t", "--target", type=int, help="target audience. 0 = beginner, 1 = intermediate" \ "2 = advanced, 3 = professional") parser.add_argument("-e", "--exercises", type=int, help="Should the book contain exercises? 0 = no, 1 = yes") parser.add_argument("-c", "--codeexamples", type=int, help="Should the book contain code examples? 0 = no, 1 = yes") parser.add_argument("-r", "--minresults", type=int, help="show no less than MINRESULTS books") parser.add_argument("-o", "--output-format", default='openccg', help=("output format: openccg, hlds, textplan-xml, textplan-featstruct. " "default: openccg")) parser.add_argument("-d", "--output-language", default='de', help=("output natural language: currently only 'de' for German is supported. " "default: de")) args = parser.parse_args(argv) if args.keywords is not None: for keyword in args.keywords: self.queries.append(self.__substring_query("keywords", keyword)) if args.language is not None: self.queries.append(self.__string_query("lang", args.language)) if args.proglang is not None: for proglang in args.proglang: self.queries.append(self.__substring_query("plang", proglang)) if args.pagerange is not None: self.queries.append(self.__pages_query(args.pagerange)) if args.target is not None: # confusion: in the db, advanced is encoded as "3" # --> blame JPolibox ;) target_error = """target should be: 0 (beginner), 1 (intermediate), 2 (advanced) or 3 (professional)""" assert args.target in (0, 1, 2, 3), target_error self.queries.append(self.__equals_query("target", args.target)) if args.exercises is not None: exercises_error = """exercises value should be either 0 (books should have no exercises) or 1 (book should have exercises)""" assert args.exercises in (0, 1), exercises_error self.queries.append(self.__equals_query("exercises", args.exercises)) if args.codeexamples is not None: codeexamples_error = """codeexamples value should be either 0 (books should have no code examples) or 1 (book should have code examples)""" assert args.codeexamples in (0, 1), codeexamples_error self.queries.append(self.__equals_query("examples", args.codeexamples)) if args.minresults is not None: assert args.minresults > 0, """the minimal number of results must be 1""" self.minresults = args.minresults self.query_args = args # we still need them in pypolibox.main() self.and_query = self.__construct_query(self.queries, query_and) self.or_query = self.__construct_query(self.queries, query_or) def __construct_query(self, queries, query_combinator): """ helper function for __init__: takes a list of query arguments and combines them into one complex SQL query (using either boolean AND or boolean OR). :param queries: a list of queries in SQL notation :type queries: ``list`` of ``str`` :param query_combinator: a string that can be used to combine SQL queries, e.g. " AND " or " OR " :type query_combinator: ``str`` :return: a complex SQL query :rtype: ``str`` """ query_template = "SELECT * FROM books " where = "WHERE " combined_queries = "" if len(queries) > 1: for query in queries[:-1]: #combine queries with " AND ", but don't append #after the last query combined_queries += query + query_combinator combined_queries += queries[-1] return query_template + where + combined_queries elif len(queries) == 1: # simple query, no combination needed query = queries[0] # list with one string element --> string return query_template + where + query else: #empty query return query_template # query will show all books in the db def __pages_query(self, length_category): """ helper function for __init__: constructs a query for page ranges (e.g. the book should have less than 300 pages or between 300 and 600 pages). :param length_category: an integer specifying the page range of the book (0: short, 1: medium length, 2: long) :type length_category: ``int`` :return: a part of a simple SQL query, e.g. 'pages < 300' :rtype: ``str`` """ length_error = """length value should be either 0: short, 1: medium length or 2: long""" assert length_category in (0, 1, 2), length_error if length_category == 0: return "pages < 300" if length_category == 1: return "pages >= 300 AND pages < 600" if length_category == 2: return "pages >= 600" def __substring_query(self, sql_column, substring): """ helper function for __init__: our database has a strange format that combines several values in the same string. For example, under the key 'keywords', there could be a value such as '[semantics][parsing][phonology]'. Therefore, we'll need to query for substrings, e.g. to find a book about 'semantics'. :param sql_column: the name of the column in the database we're querying, e.g. 'keywords' :type sql_column: ``str`` :param substring: a string we're looking for, e.g. 'semantics' :type substring: ``str`` :return: a part of a simple SQL query, e.g. 'keyword like %semantics%' :rtype: ``str`` """ # keyword --> '%keyword%' for SQL LIKE queries sql_substring = "'%{0}%'".format(substring) substring_query = "{0} like {1}".format(sql_column, sql_substring) return substring_query def __string_query(self, sql_column, string): """ helper function for __init__: find all database items that completely match a string in a given column, e.g. WHERE lang = 'German' :param sql_column: the name of the column in the database we're querying, e.g. 'lang' :type sql_column: ``str`` :param string: a string we're looking for, e.g. 'German' :type string: ``str`` :return: a part of a simple SQL query, e.g. 'keyword like %semantics%' :rtype: ``str`` """ return "{0} = '{1}'".format(sql_column, string) def __equals_query(self, sql_column, integer): """ helper function for __init__: find all database items that completely match an integer value in a given column, e.g. WHERE exercises = 1 :return: a part of a simple SQL query, e.g. 'exercises = 1' :rtype: ``str`` """ return "{0} = {1}".format(sql_column, integer) def __str__(self): """ If you print a ``Query`` instance, it will return the query strings that will be send to the database. """ ret_str = "The arguments (parsed from the command line): " + \ "{0}\nhave resulted in the following SQL query:".format(self.query_args) + \ "\n{0}\n\nIf the query should return less than ".format(self.and_query) + \ "{0} book(s), this query will be used and ranked ".format(self.minresults) + \ "according to the number of query parameter matches:\n{0}".format(self.or_query) return ret_str
[docs]class Results: """ A ``Results`` instance sends queries to the database, retrieves and stores the results. """ def __init__ (self, query): """ initialises a connection to the db, sends queries and stores results in self.query_results If the query (combining query parameters with boolean AND) returns less than query.minresults books, a different query will be sent (combining query parameters with boolean OR). In the latter case, a maximum score (possible_matches) will be calculated (how many query parameters does a result match). possible_matches will be used by a ``Books`` instance to find the n-best matching books. :type query: instance of class ``Query`` :param query: an instance of the class Query() """ self.and_query_results = [] self.or_query_results = [] self.query_results = [] self.query_args = query.query_args self.and_query = query.and_query self.or_query = query.or_query self.minresults = query.minresults self.possible_matches = 0 conn = sqlite3.connect(DB_FILE) self.curs = conn.cursor() # NOTE: this has to be done BEFORE the actual query, # otherwise we'll overwrite the cursor! self.db_columns = self.get_table_header(BOOK_TABLE_NAME) and_sql_cursor = self.curs.execute(self.and_query) for result in and_sql_cursor: self.and_query_results.append(result) if len(self.and_query_results) >= self.minresults: self.possible_matches = self.get_number_of_possible_matches() self.query_results = self.and_query_results self.query_type = 'and' # if 'AND query' doesn't return enough results ... # TODO: this 'else block' only needs to be executed if the and_query # has too few results AND that query consists of more than one # parameter -- otherwise, it won't improve results. else: or_sql_cursor = self.curs.execute(query.or_query) for result in or_sql_cursor: self.or_query_results.append(result) self.possible_matches = self.get_number_of_possible_matches() self.query_results = self.or_query_results self.query_type = 'or' conn.close() # close connection to sqlite db
[docs] def get_number_of_possible_matches(self): """ Counts the number of query paramters that ``could`` be matched by books from the results set. The actual scoring of books takes place in ``Books.get_book_ranks()``. For example, if a query contains the parameters:: keywords = pragmatics, keywords = semantics, language = German it means that a book could possible match 3 parameters (possible_matches = 3). :return: the number of possible matches :rtype: ``int`` """ possible_matches = 0 self.params = [param for param in self.query_args.__dict__ if param is not 'minresults' if self.query_args.__getattribute__(param) is not None] self.values = map(self.query_args.__getattribute__, self.params) for value in self.values: if type(value) == list: possible_matches += len(value) else: possible_matches += 1 return possible_matches
[docs] def get_table_header(self, table_name): """ get the column names (e.g. title, year, authors) and their index from the books table of the db and return them as a dictionary. :param table_name: name of a database table, e.g. 'books' :type table_name: ``str`` :return: a dictionary, which contains the names of the table columns as keys and their index as values :rtype: ``dict``, with ``str`` keys and ``int`` values """ table_info = self.curs.execute('PRAGMA table_info({0})'.format(table_name)) db_columns = {} for index, name, data_type, notnull, dflt_value, pk in table_info: db_columns[name.encode(DEFAULT_ENCODING)] = index return db_columns
def __str__(self): """ prints the number of results and if boolean AND or boolean OR has been used to gather at least self.minresults number of books :rtype: ``str`` """ ret_str = "The query:\n{0}\n\nreturned ".format(self.and_query) + \ "{0} result(s):\n\n".format(len(self.and_query_results)) for book in self.and_query_results: ret_str += str(book) + "\n" if len(self.and_query_results) < self.minresults: ret_str += "\nLess than {0} queries were returned, ".format(self.minresults) + \ "therefore the query had to be rephrased:\n{0}\n".format(self.or_query) + \ "and returned these results:\n{0}".format(self.or_query_results) return ret_str
[docs]class Books: """ a ``Books`` instance stores ``all`` books that were found by a database query as a list of ``Book`` instances in ``self.books`` """ def __init__ (self, results): """ This constructor generates a list of ``Book`` instances (saved in ``self.books``), each representing one book retrieved from a database query. Additionally, this method will attach a score to each book (depending on the number of query parameters it matches) using the ``get_book_ranks()`` method. :param results: a ``Results`` instance containing the results from a database query :type results: ``Results`` """ # original query arguments for debugging self.query_args = results.query_args self.query_type = results.query_type self.books = [] sorted_books = [] for result in results.query_results: book_item = Book(result, results.db_columns, results.query_args) self.books.append(book_item) if self.query_type == 'and': #since all 'AND query' results match all query parameters, #their score will always be 1.0 self.scores = [1.0 for book in range(len(self.books))] elif self.query_type == 'or': book_ranks = self.get_book_ranks(results.possible_matches) for (score, index) in book_ranks: sorted_books.append( (self.books[index], score) ) #magic unzip / reverse zip function if len(sorted_books) == 0: self.books = [] self.scores = [] else: self.books, self.scores = zip(*sorted_books)
[docs] def get_book_ranks(self, possible_matches): """ ranks 'OR query' results according to the number of query parameters they match. Parameters ---------- possible_matches : int the number of (meaningful) parameters of the query. Returns ------- book_ranks : list of (float, int) tuples a list of tuples, where each tuple consists of the score of a book and its index in self.books """ scores = [] for index, book in enumerate(self.books): score = float(book.book_matches) / float(possible_matches) scores.append( (score, index) ) return sorted(scores, reverse=True) #best (highest) scores first
def __str__(self): """ prints the index, book score and database key value pairs about each book that the query returned. """ return_string = "" # since all 'AND query' results match all query parameters, # the score is always 1.0 if self.query_type == 'and': for index, book in enumerate(self.books): book_string = "index: {0}, score: 1.0\n{1}\n".format(index, book.__str__()) return_string += book_string return return_string elif self.query_type == 'or': for index, book in enumerate(self.books): book_string = "index: {0}, score: {1}\n{2}\n".format(index, self.scores[index], book.__str__()) return_string += book_string return return_string
[docs]class Book: """ a ``Book`` instance represents ``one`` book from a database query """ def __init__ (self, db_item, db_columns, query_args): """ Fills a ``Book`` instance with metadata from the database. Typical book metadata will look like this:: language: English title: Computational Linguistics. An Introduction. pagerange: 0 query_args: Namespace(codeexamples=None, exercises=None, keywords=None, language=None, minresults=2, pagerange=None, proglang=None, target=None) year: 1986 codeexamples: 0 proglang: set([]) exercises: 1 book_matches: 0 authors: set(['Ralph Grishman']) keywords: set(['generation', 'discourse', 'semantics', 'parsing']) pages: 193 All key value pairs from the database are encoded from unicode to UTF8 and the number of query parameters that a book matches is calculated via ``get_number_of_book_matches()``. :param db_item: an item from the ``sqlite3.Cursor`` object that contains the results from the db query. :type db_item: ``tuple`` :param db_columns: a dictionary of table columns (e.g. title, authors) from the database :type db_columns: ``dict`` :param query_args: a key/value store containing the original user query :type query_args: ``argparse.Namespace`` """ #needed for generating query facts later on self.query_args = query_args self.title = db_item[db_columns["title"]].encode(DEFAULT_ENCODING) self.year = db_item[db_columns["year"]] authors_array = db_item[db_columns["authors"]].encode(DEFAULT_ENCODING) self.authors = util.sql_array_to_set(authors_array) keywords_array = db_item[db_columns["keywords"]].encode(DEFAULT_ENCODING) self.keywords = util.sql_array_to_set(keywords_array) self.language = db_item[db_columns["lang"]].encode(DEFAULT_ENCODING) proglang_array = db_item[db_columns["plang"]].encode(DEFAULT_ENCODING) self.proglang = util.sql_array_to_set(proglang_array) self.pages = db_item[db_columns["pages"]] if self.pages < 300: self.pagerange = 0 elif self.pages >= 300 and self.pages < 600: self.pagerange = 1 elif self.pages >= 600: self.pagerange = 2 self.target = db_item[db_columns["target"]] self.exercises = db_item[db_columns["exercises"]] self.codeexamples = db_item[db_columns["examples"]] self.book_matches = self.get_number_of_book_matches()
[docs] def get_number_of_book_matches(self): """ calculates the number of query parameters that a book matches :rtype: ``int`` """ book_matches = 0 simple_attributes = ['codeexamples', 'exercises', 'language', 'pagerange', 'target'] # may contain more than 1 value complex_attributes = ['keywords', 'proglang'] for simple_attrib in simple_attributes: if self.query_args.__getattribute__(simple_attrib) == getattr(self, simple_attrib): book_matches += 1 for complex_attrib in complex_attributes: if self.query_args.__getattribute__(complex_attrib) is not None: for value in self.query_args.__getattribute__(complex_attrib): if value in getattr(self, complex_attrib): book_matches += 1 return book_matches
def __str__(self): """ prints the book metadata gathered from the database """ return_string = "" for key, value in self.__dict__.iteritems(): return_string += "{0}:\t\t{1}\n".format(key, value) return return_string
[docs]def get_column(column_name): """ debugging: primitive db query that returns all the values stored in a column, e.g. get_column("title") will return all book titles stored in the database :type column_name: ``str`` :rtype: ``list`` of ``str`` """ conn = sqlite3.connect(DB_FILE) curs = conn.cursor() col_curs = curs.execute("PRAGMA table_info({0})".format(BOOK_TABLE_NAME)) columns = [header[1] for header in col_curs] #print "available table columns: {0}\n".format(columns) results_cursor = curs.execute("select {0} from books".format(column_name)) results = [result[0] for result in results_cursor] conn.close() # close connection to db return results