#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <arne-neumann@web.de>
"""
The ``database`` module is responsible for parsing the user's requirements
(both from command line options, as well as interactively from the Python
interpreter), transforming these requirements into an SQL query, querying the
sqlite database and returning the results.
"""
import os
import argparse
import sqlite3
import util
if __name__ == '__main__':
DB_FILE = 'data/books.sqlite'
else:
DB_FILE = os.path.join(os.path.dirname(__file__),'data/books.sqlite')
BOOK_TABLE_NAME = 'books' # name of the table in the database file that
# contains info about books
DEFAULT_ENCODING = 'UTF8'
[docs]class Query:
"""
a ``Query`` instance represents one user query to the database
Queries can be made from the command line, as well as from the Python
interpreter. From the command line, queries can be made using either
abbreviated or long parameters. The following examples both query the
database for books that contain code examples and deal with both semantics
and parsing::
python pypolibox.py -k semantics, parsing -c 1
python pypolibox.py --keywords semantics, parsing --codeexamples 1
When calling ``pypolibox.py`` from within the Python interpreter, the same
query can be made using the following command::
Query(["-k", "semantics", "parsing", "-c", "1"])
If you print the ``Query`` instance (by using the ``print`` command), it
will return the SQL query that was constructed from the user input::
SELECT * FROM books WHERE keywords like '%semantics%' AND keywords
like '%parsing%' AND examples = 1
TODO: This module talks directly to the database. To make it easier to
adapt pypolibox to a different domain, an SQL abstraction layer (e.g.
SQL Alchemy) should be used.
"""
def __init__ (self, argv):
"""
given a list of query arguments, this constructor parses commandline
options with argparse, constructs a valid sql query and stores the
resulting query strings in self.and_query (using boolean AND to
combine the query arguments) and self.or_query (boolean OR).
TODO: add max_textplans paramter --> generate only the X highest
ranking books
:param argv: a list of strings (either parsed from the command line
or set programmatically)
:type argv: ``list`` of ``str``
"""
self.queries = []
self.minresults = 3
query_and = " AND "
query_or = " OR "
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keywords", nargs='+',
help="Which topic(s) should the book cover?")
#nargs='+' handles 1 or more args
parser.add_argument("-l", "--language",
help="Which language should the book have?")
parser.add_argument("-p", "--proglang", nargs='+',
help="Which programming language(s) should the book use?")
parser.add_argument("-s", "--pagerange", type=int,
help="book length ranges. 0 = less than 300 pages, " \
"1 = between 300 and 600 pages. 2 = more than 600 pages.")
parser.add_argument("-t", "--target", type=int,
help="target audience. 0 = beginner, 1 = intermediate" \
"2 = advanced, 3 = professional")
parser.add_argument("-e", "--exercises", type=int,
help="Should the book contain exercises? 0 = no, 1 = yes")
parser.add_argument("-c", "--codeexamples", type=int,
help="Should the book contain code examples? 0 = no, 1 = yes")
parser.add_argument("-r", "--minresults", type=int,
help="show no less than MINRESULTS books")
parser.add_argument("-o", "--output-format",
default='openccg',
help=("output format: openccg, hlds, textplan-xml, textplan-featstruct. "
"default: openccg"))
parser.add_argument("-d", "--output-language",
default='de',
help=("output natural language: currently only 'de' for German is supported. "
"default: de"))
args = parser.parse_args(argv)
if args.keywords is not None:
for keyword in args.keywords:
self.queries.append(self.__substring_query("keywords",
keyword))
if args.language is not None:
self.queries.append(self.__string_query("lang", args.language))
if args.proglang is not None:
for proglang in args.proglang:
self.queries.append(self.__substring_query("plang", proglang))
if args.pagerange is not None:
self.queries.append(self.__pages_query(args.pagerange))
if args.target is not None:
# confusion: in the db, advanced is encoded as "3"
# --> blame JPolibox ;)
target_error = """target should be: 0 (beginner), 1
(intermediate), 2 (advanced) or 3 (professional)"""
assert args.target in (0, 1, 2, 3), target_error
self.queries.append(self.__equals_query("target", args.target))
if args.exercises is not None:
exercises_error = """exercises value should be either 0 (books
should have no exercises) or 1 (book should have exercises)"""
assert args.exercises in (0, 1), exercises_error
self.queries.append(self.__equals_query("exercises",
args.exercises))
if args.codeexamples is not None:
codeexamples_error = """codeexamples value should be either 0
(books should have no code examples) or 1 (book should have code
examples)"""
assert args.codeexamples in (0, 1), codeexamples_error
self.queries.append(self.__equals_query("examples",
args.codeexamples))
if args.minresults is not None:
assert args.minresults > 0, """the minimal number of results must
be 1"""
self.minresults = args.minresults
self.query_args = args # we still need them in pypolibox.main()
self.and_query = self.__construct_query(self.queries, query_and)
self.or_query = self.__construct_query(self.queries, query_or)
def __construct_query(self, queries, query_combinator):
"""
helper function for __init__: takes a list of query arguments and
combines them into one complex SQL query (using either boolean AND or
boolean OR).
:param queries: a list of queries in SQL notation
:type queries: ``list`` of ``str``
:param query_combinator: a string that can be used to combine SQL
queries, e.g. " AND " or " OR "
:type query_combinator: ``str``
:return: a complex SQL query
:rtype: ``str``
"""
query_template = "SELECT * FROM books "
where = "WHERE "
combined_queries = ""
if len(queries) > 1:
for query in queries[:-1]:
#combine queries with " AND ", but don't append
#after the last query
combined_queries += query + query_combinator
combined_queries += queries[-1]
return query_template + where + combined_queries
elif len(queries) == 1: # simple query, no combination needed
query = queries[0] # list with one string element --> string
return query_template + where + query
else: #empty query
return query_template # query will show all books in the db
def __pages_query(self, length_category):
"""
helper function for __init__: constructs a query for page ranges
(e.g. the book should have less than 300 pages or between 300 and 600
pages).
:param length_category: an integer specifying the page range of the
book (0: short, 1: medium length, 2: long)
:type length_category: ``int``
:return: a part of a simple SQL query, e.g. 'pages < 300'
:rtype: ``str``
"""
length_error = """length value should be either 0: short, 1: medium
length or 2: long"""
assert length_category in (0, 1, 2), length_error
if length_category == 0:
return "pages < 300"
if length_category == 1:
return "pages >= 300 AND pages < 600"
if length_category == 2:
return "pages >= 600"
def __substring_query(self, sql_column, substring):
"""
helper function for __init__: our database has a strange format that
combines several values in the same string. For example, under the
key 'keywords', there could be a value such as
'[semantics][parsing][phonology]'. Therefore, we'll need to query for
substrings, e.g. to find a book about 'semantics'.
:param sql_column: the name of the column in the database we're
querying, e.g. 'keywords'
:type sql_column: ``str``
:param substring: a string we're looking for, e.g. 'semantics'
:type substring: ``str``
:return: a part of a simple SQL query, e.g. 'keyword like %semantics%'
:rtype: ``str``
"""
# keyword --> '%keyword%' for SQL LIKE queries
sql_substring = "'%{0}%'".format(substring)
substring_query = "{0} like {1}".format(sql_column, sql_substring)
return substring_query
def __string_query(self, sql_column, string):
"""
helper function for __init__: find all database items that completely
match a string in a given column, e.g. WHERE lang = 'German'
:param sql_column: the name of the column in the database we're
querying, e.g. 'lang'
:type sql_column: ``str``
:param string: a string we're looking for, e.g. 'German'
:type string: ``str``
:return: a part of a simple SQL query, e.g. 'keyword like %semantics%'
:rtype: ``str``
"""
return "{0} = '{1}'".format(sql_column, string)
def __equals_query(self, sql_column, integer):
"""
helper function for __init__: find all database items that completely
match an integer value in a given column, e.g. WHERE exercises = 1
:return: a part of a simple SQL query, e.g. 'exercises = 1'
:rtype: ``str``
"""
return "{0} = {1}".format(sql_column, integer)
def __str__(self):
"""
If you print a ``Query`` instance, it will return the query strings
that will be send to the database.
"""
ret_str = "The arguments (parsed from the command line): " + \
"{0}\nhave resulted in the following SQL query:".format(self.query_args) + \
"\n{0}\n\nIf the query should return less than ".format(self.and_query) + \
"{0} book(s), this query will be used and ranked ".format(self.minresults) + \
"according to the number of query parameter matches:\n{0}".format(self.or_query)
return ret_str
[docs]class Results:
"""
A ``Results`` instance sends queries to the database, retrieves and stores
the results.
"""
def __init__ (self, query):
"""
initialises a connection to the db, sends queries and stores results
in self.query_results
If the query (combining query parameters with boolean AND) returns
less than query.minresults books, a different query will be sent
(combining query parameters with boolean OR). In the latter case, a
maximum score (possible_matches) will be calculated (how many query
parameters does a result match). possible_matches will be used by
a ``Books`` instance to find the n-best matching books.
:type query: instance of class ``Query``
:param query: an instance of the class Query()
"""
self.and_query_results = []
self.or_query_results = []
self.query_results = []
self.query_args = query.query_args
self.and_query = query.and_query
self.or_query = query.or_query
self.minresults = query.minresults
self.possible_matches = 0
conn = sqlite3.connect(DB_FILE)
self.curs = conn.cursor()
# NOTE: this has to be done BEFORE the actual query,
# otherwise we'll overwrite the cursor!
self.db_columns = self.get_table_header(BOOK_TABLE_NAME)
and_sql_cursor = self.curs.execute(self.and_query)
for result in and_sql_cursor:
self.and_query_results.append(result)
if len(self.and_query_results) >= self.minresults:
self.possible_matches = self.get_number_of_possible_matches()
self.query_results = self.and_query_results
self.query_type = 'and'
# if 'AND query' doesn't return enough results ...
# TODO: this 'else block' only needs to be executed if the and_query
# has too few results AND that query consists of more than one
# parameter -- otherwise, it won't improve results.
else:
or_sql_cursor = self.curs.execute(query.or_query)
for result in or_sql_cursor:
self.or_query_results.append(result)
self.possible_matches = self.get_number_of_possible_matches()
self.query_results = self.or_query_results
self.query_type = 'or'
conn.close() # close connection to sqlite db
[docs] def get_number_of_possible_matches(self):
"""
Counts the number of query paramters that ``could`` be matched by books
from the results set. The actual scoring of books takes place in
``Books.get_book_ranks()``.
For example, if a query contains the parameters::
keywords = pragmatics, keywords = semantics, language = German
it means that a book could possible match 3 parameters
(possible_matches = 3).
:return: the number of possible matches
:rtype: ``int``
"""
possible_matches = 0
self.params = [param for param in self.query_args.__dict__
if param is not 'minresults'
if self.query_args.__getattribute__(param) is not None]
self.values = map(self.query_args.__getattribute__, self.params)
for value in self.values:
if type(value) == list:
possible_matches += len(value)
else:
possible_matches += 1
return possible_matches
def __str__(self):
"""
prints the number of results and if boolean AND or boolean OR has
been used to gather at least self.minresults number of books
:rtype: ``str``
"""
ret_str = "The query:\n{0}\n\nreturned ".format(self.and_query) + \
"{0} result(s):\n\n".format(len(self.and_query_results))
for book in self.and_query_results:
ret_str += str(book) + "\n"
if len(self.and_query_results) < self.minresults:
ret_str += "\nLess than {0} queries were returned, ".format(self.minresults) + \
"therefore the query had to be rephrased:\n{0}\n".format(self.or_query) + \
"and returned these results:\n{0}".format(self.or_query_results)
return ret_str
[docs]class Books:
"""
a ``Books`` instance stores ``all`` books that were found by a database query
as a list of ``Book`` instances in ``self.books``
"""
def __init__ (self, results):
"""
This constructor generates a list of ``Book`` instances (saved in
``self.books``), each representing one book retrieved from a database
query. Additionally, this method will attach a score to each book
(depending on the number of query parameters it matches) using the
``get_book_ranks()`` method.
:param results: a ``Results`` instance containing the results from a
database query
:type results: ``Results``
"""
# original query arguments for debugging
self.query_args = results.query_args
self.query_type = results.query_type
self.books = []
sorted_books = []
for result in results.query_results:
book_item = Book(result, results.db_columns, results.query_args)
self.books.append(book_item)
if self.query_type == 'and':
#since all 'AND query' results match all query parameters,
#their score will always be 1.0
self.scores = [1.0 for book in range(len(self.books))]
elif self.query_type == 'or':
book_ranks = self.get_book_ranks(results.possible_matches)
for (score, index) in book_ranks:
sorted_books.append( (self.books[index], score) )
#magic unzip / reverse zip function
if len(sorted_books) == 0:
self.books = []
self.scores = []
else:
self.books, self.scores = zip(*sorted_books)
[docs] def get_book_ranks(self, possible_matches):
"""
ranks 'OR query' results according to the number of query parameters
they match.
Parameters
----------
possible_matches : int
the number of (meaningful) parameters of the query.
Returns
-------
book_ranks : list of (float, int) tuples
a list of tuples, where each tuple consists of the score of
a book and its index in self.books
"""
scores = []
for index, book in enumerate(self.books):
score = float(book.book_matches) / float(possible_matches)
scores.append( (score, index) )
return sorted(scores, reverse=True) #best (highest) scores first
def __str__(self):
"""
prints the index, book score and database key value pairs about each
book that the query returned.
"""
return_string = ""
# since all 'AND query' results match all query parameters,
# the score is always 1.0
if self.query_type == 'and':
for index, book in enumerate(self.books):
book_string = "index: {0}, score: 1.0\n{1}\n".format(index,
book.__str__())
return_string += book_string
return return_string
elif self.query_type == 'or':
for index, book in enumerate(self.books):
book_string = "index: {0}, score: {1}\n{2}\n".format(index,
self.scores[index],
book.__str__())
return_string += book_string
return return_string
[docs]class Book:
"""
a ``Book`` instance represents ``one`` book from a database query
"""
def __init__ (self, db_item, db_columns, query_args):
"""
Fills a ``Book`` instance with metadata from the database. Typical
book metadata will look like this::
language: English
title: Computational Linguistics. An Introduction.
pagerange: 0
query_args: Namespace(codeexamples=None, exercises=None,
keywords=None, language=None, minresults=2,
pagerange=None, proglang=None, target=None)
year: 1986
codeexamples: 0
proglang: set([])
exercises: 1
book_matches: 0
authors: set(['Ralph Grishman'])
keywords: set(['generation', 'discourse', 'semantics',
'parsing'])
pages: 193
All key value pairs from the database are encoded from unicode
to UTF8 and the number of query parameters that a book matches is
calculated via ``get_number_of_book_matches()``.
:param db_item: an item from the ``sqlite3.Cursor`` object that contains
the results from the db query.
:type db_item: ``tuple``
:param db_columns: a dictionary of table columns (e.g. title, authors)
from the database
:type db_columns: ``dict``
:param query_args: a key/value store containing the original user query
:type query_args: ``argparse.Namespace``
"""
#needed for generating query facts later on
self.query_args = query_args
self.title = db_item[db_columns["title"]].encode(DEFAULT_ENCODING)
self.year = db_item[db_columns["year"]]
authors_array = db_item[db_columns["authors"]].encode(DEFAULT_ENCODING)
self.authors = util.sql_array_to_set(authors_array)
keywords_array = db_item[db_columns["keywords"]].encode(DEFAULT_ENCODING)
self.keywords = util.sql_array_to_set(keywords_array)
self.language = db_item[db_columns["lang"]].encode(DEFAULT_ENCODING)
proglang_array = db_item[db_columns["plang"]].encode(DEFAULT_ENCODING)
self.proglang = util.sql_array_to_set(proglang_array)
self.pages = db_item[db_columns["pages"]]
if self.pages < 300:
self.pagerange = 0
elif self.pages >= 300 and self.pages < 600:
self.pagerange = 1
elif self.pages >= 600:
self.pagerange = 2
self.target = db_item[db_columns["target"]]
self.exercises = db_item[db_columns["exercises"]]
self.codeexamples = db_item[db_columns["examples"]]
self.book_matches = self.get_number_of_book_matches()
[docs] def get_number_of_book_matches(self):
"""
calculates the number of query parameters that a book matches
:rtype: ``int``
"""
book_matches = 0
simple_attributes = ['codeexamples', 'exercises', 'language',
'pagerange', 'target']
# may contain more than 1 value
complex_attributes = ['keywords', 'proglang']
for simple_attrib in simple_attributes:
if self.query_args.__getattribute__(simple_attrib) == getattr(self,
simple_attrib):
book_matches += 1
for complex_attrib in complex_attributes:
if self.query_args.__getattribute__(complex_attrib) is not None:
for value in self.query_args.__getattribute__(complex_attrib):
if value in getattr(self, complex_attrib):
book_matches += 1
return book_matches
def __str__(self):
"""
prints the book metadata gathered from the database
"""
return_string = ""
for key, value in self.__dict__.iteritems():
return_string += "{0}:\t\t{1}\n".format(key, value)
return return_string
[docs]def get_column(column_name):
"""
debugging: primitive db query that returns all the values stored in a
column, e.g. get_column("title") will return all book titles stored in
the database
:type column_name: ``str``
:rtype: ``list`` of ``str``
"""
conn = sqlite3.connect(DB_FILE)
curs = conn.cursor()
col_curs = curs.execute("PRAGMA table_info({0})".format(BOOK_TABLE_NAME))
columns = [header[1] for header in col_curs]
#print "available table columns: {0}\n".format(columns)
results_cursor = curs.execute("select {0} from books".format(column_name))
results = [result[0] for result in results_cursor]
conn.close() # close connection to db
return results