# =============================================================================
# aggregation.py - A Python module for database aggregation functions
# Copyright (C) 2000, 2001 Ole Moller Nielsen & Peter Christen
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License (http://www.gnu.org/copyleft/gpl.html)
#    for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
#
#    
# Contact address: Ole.Nielsen@bigfoot.com
#                  Peter.Christen@anu.edu.au
#
# Version 1.0 March 2001
# =============================================================================


# -----------------------------------------------------------------------------

def standard_breakdown(attribute1, attribute2, tables, selectors=None, 
                       caching=1, verbose=0):
  """Returns a dictionary of attribute1 values with attribute2 values in lists.

  USAGE:
    result_dict = standard_breakdown(attribute1, attribute2, tables,
                                     selectors, caching, verbose)

  ARGUMENTS:
    attribute1 -- Attribute for which a dictionary is built. (Required)
    attribute2 -- Attribute which is listed for each occurrence of attribute1.
                  (Required)
    tables --     List of conforming tables. (Required)
    selectors --  Selectors, see documentation for 'selectors_to_clause'.
                  If 'selectors' is of type string, it is directly used for
                  the SQL query.
                  (Default: None)
    caching --    Enable or disable caching of query results. (Default: 1)
    verbose --    Verbose mode. (Default: 0)

  DESCRIPTION:
    A function to get all occurrences of 'attribute2' for each distinct value
    of 'attribute1' from the (conforming) database tables where the criteria
    defined by the list of (optional) selectors is met (using conjunction
    AND). 'attribute1' and 'attribute2' must be valid attribute names.
    See 'selector_to_clause' for documentation on selectors.

  EXAMPLE:
    The calls
      tables = ['1997','1998','1999']
      result_dict = standard_breakdown('COMPANY', 'CUSTOMER', tables)
    return for each COMPANY a list of all their CUSTOMERS over the given three
    years period. Each COMPANY name is a key of the result_dict.
  """

  # Fixme:
  #
  # Check that tables exist and conform.
  # Check that 'attribute1' and 'attribute2' are valid attributes.

  # Idea:
  #
  # Perhaps 'attribute2' could a list of attributes?

  from caching import cache
  import database
  import types

  if not type(tables) in [types.ListType, types.TupleType]:
    tables = [tables]

  # Convert selectors to SQL clause
  #
  clause = ''
  if selectors:
    if type(selectors) == types.StringType:
      clause = selectors  # Pass clause directly on to query
    else:
      clause = selector_to_clause(selectors)  # Convert to clause

  # Idea:
  #
  # We should try to parallelise this, but the problems are dependencies
  # (within multi_query)?

  main_dict = {}  # Dictionary of all distinct values of 'attribute1'

  for table in tables:
    # Build query
    #
    query = 'select ' + attribute1 + ', ' + attribute2 + ' from '
    query = query + table
    if clause:
      query = query + ' where ' + clause
    query = query + ' group by ' + attribute1 + ', ' + attribute2

    basename = database.options['database_dir']+table
    deps = (basename+'.frm', basename+'.ISD', basename+'.ISM')
    A = cache(database.exec_query, query, dependencies=deps, verbose=verbose,
              evaluate=not caching)

    # Sum up number of transactions from each table
    #
    for record in A:
      attribute1val = record[0]
      attribute2val = record[1]

      if not main_dict.has_key(attribute1val):
        main_dict[attribute1val] = {}  # Create an empty dictionary

      sub_dict = main_dict[attribute1val] # Dictionary for secondary field

      if not sub_dict.has_key(attribute2val):
        sub_dict[attribute2val] = 0  # Register occurrence of attribute2

  main_keys = main_dict.keys()  # Convert sub_dict's int lists
  for key in main_keys:
    sub_dict = main_dict[key]
    sub_list = sub_dict.keys()
    sub_list.sort()
    main_dict[key] = sub_list  # Return sorted keys in list

  return(main_dict)

# -----------------------------------------------------------------------------

def standard_count(attribute1, attribute2, tables, selectors=None, caching=1,
                   verbose=0):
  """Returns a dict. of attribute1 values with numbers of attribute2 values.

  USAGE:
    result_dict = standard_count(attribute1, attribute2, tables,
                                 selectors, caching, verbose)

  ARGUMENTS:
    attribute1 -- Attribute for which a dictionary is built. (Required)
    attribute2 -- Attribute which is counted for each occurrence of attribute1.
                  (Required)
    tables --     List of conforming tables. (Required)
    selectors --  Selectors, see documentation for 'selectors_to_clause'.
                  If 'selectors' is of type string, it is directly used for
                  the SQL query.
                  (Default: None)
    caching --    Enable or disable caching of query results. (Default: 1)
    verbose --    Verbose mode. (Default: 0)

  DESCRIPTION:
    A function to count all occurrences of 'attribute2' for each distinct value
    of 'attribute1' from the (conforming) database tables where the criteria
    defined by the list of (optional) selectors are met (using conjunction
    AND). 'attribute1' and 'attribute2' must be valid attribute names, except
    that 'attribute2' can be a '*' in which case the number of transactions
    for value of 'attribute1' are counted. This function is based on
    'standard_breakdown' and 'count_transactions'.
    See 'selector_to_clause' for documentation on selectors.

  EXAMPLE:
    The calls
      tables = ['1997','1998','1999']
      result_dict = standard_count('COMPANY', 'CUSTOMER',tables)
    return for each COMPANY the number of different CUSTOMERS over the given
    three years period. Each COMPANY name is a key of the result_dict.
    The calls
      tables = ['1997','1998','1999']
      sel = [('CODE', 'xyz')]
      result_dict = standard_count('COMPANY', '*', tables, selectors=sel)
    return for each COMPANY the number of transactions over the three years
    period where CODE='xyz'.
  """

  if attribute2 == '*':
    R = count_transactions(attribute1, tables, selectors, caching, verbose)
    return(R)

  main_dict = standard_breakdown(attribute1, attribute2, tables, selectors, \
                                 caching, verbose)
  count_dict = {}
  for key in main_dict.keys():  # Count number of occurrences
    count_dict[key] = len(main_dict[key])

  return(count_dict)

#==============================================================================
# Auxiliary functions
#==============================================================================

def count_transactions(attribute, tables, selectors=None, caching=1,
                       verbose=0):
  """Return dictionary of attribute values with transaction counts.

  USAGE:
    result_dict = count_transactions(attribute, tables, selectors,
                                     caching, verbose)

  ARGUMENTS:
    attribute -- Attribute for which a dictionary is built. (Required)
    tables --    List of conforming tables. (Required)
    selectors -- Selectors, see documentation for 'selectors_to_clause'.
                 If 'selectors' is of type string, it is directly used for
                 the SQL query.
                 (Default: None)
    caching --   Enable or disable caching of query results. (Default: 1)
    verbose --   Verbose mode. (Default: 0)

  DESCRIPTION:
    Function to get all distinct values of 'attribute' from the (conforming)
    database tables and their corresponding number of transactions where the
    criteria defined by the list of selectors is met. If selectors is absent
    or None, all distinct values will be returned.
  """

  from caching import cache
  import database
  import types

  if not type(tables) in [types.ListType, types.TupleType]:
    tables = [tables]

  # Convert selectors to SQL clause
  #
  clause = ''
  if selectors:
    if type(selectors) == types.StringType:
      clause = selectors  # Pass conditions on to query
    else:
      clause = selector_to_clause(selectors)

  # Query database
  #
  count_dict = {}
  for table in tables:

    # Build query
    #
    query = 'select ' + attribute + ', count(*) from '
    query = query + table
    if clause: query = query + ' where ' + clause
    query = query + ' group by ' + attribute

    # Execute query
    #
    basename = database.options['database_dir']+table
    deps = (basename+'.frm', basename+'.ISD', basename+'.ISM')
    A = cache(database.exec_query, query, dependencies=deps, verbose=verbose, \
              evaluate=not caching)

    # Sum up number of transactions from each table
    #
    for record in A:
      val   = record[0]
      count = record[1]

      if count_dict.has_key(val):
        count = count_dict[val] + count

      count_dict[val] = count

  return(count_dict)  
  
# -----------------------------------------------------------------------------

def selector_to_clause(selectors, verbose=0):
  """Converts a selector into an SQL clause.

  USAGE:
    clause = selector_to_clause(selectors, verbose)

  ARGUMENTS:
    selectors -- A list of selectors. (Required)
    verbose --   Verbose mode. (Default: 0)

  DESCRIPTION:
    The syntax of a selector is: selector = (attribute, match_set) where
    'match_set' is a list of values that can be attained by the attribute given
    in selector. Entries in 'match_set' can also be a list of length 2,
    signifying all values between the two values. Example:
      match_set = ['69333', '71147', ['73901', '73921']]
    matches all values x, where x=69633, x=71147, or where 73901<=x<=73912.
    If selectors is absent or 'None', all distinct values will be returned.

  EXAMPLE:
    The list of selectors
      sel = [('YEAR', 1997), ('TYPE',['a','c','g']),('QUARTER', [[1,3]])]
    is expanded into the clause:
      clause = '(YEAR="1997") and (TYPE="a" or TYPE="c" or TYPE="g") and 
                (QUARTER between "1" and "3")'
    which can then be used for SQL queries.
  """

  import types

  if type(selectors) != types.ListType:
    selectors = [selectors]

  clause = ''
  n = 0
  for selector in selectors:
    if not type(selector) == types.TupleType:
      if verbose:
        print 'ERROR: (selector_to_clause): selector must be either a ',
        print 'tuple or a list of tuples of the form (attribname,list).'
      raise TypeError(selector)

    (attrib, match_set) = selector
    if not type(match_set) in [types.ListType, types.TupleType]:
      match_set = [match_set]

    clause = clause + form_criterion(attrib, match_set, verbose=verbose)
    if n < len(selectors)-1:
      clause = clause + ' and '
    n = n+1
  return(clause)

# -----------------------------------------------------------------------------

def form_criterion(attribute, items, verbose=0):
  """Create an SQL criterion given an attribute and a list of items.

  USAGE:
    criterion = form_criterion(attribute, items, verbose)

  ARGUMENTS:
    attribute -- A valid table attribute. (Required)
    items --     A list of items. (Required)
    verbose --   Verbose mode. (Default: 0)

  DESCRIPTION:
    See documentation for 'selector_to_clause'.
  """
 
  import types

  criterion = '('
  if (attribute and items):
    for l in range(len(items)):
      if l > 0:
        criterion = criterion + ' or '
      item = items[l]
      criterion = criterion + attribute

      if type(item) == types.ListType:
        if len(item) == 2:
          criterion = criterion + ' between "' + str(item[0]) + \
                      '" and "' + str(item[1]) + '"'
        else:
          if verbose:
            print 'ERROR (Form_criterion): Singletons or pairs only'
          raise TypeError(item)
      else:
        criterion = criterion + '="' + str(item) + '"'

  criterion = criterion+')'

  return(criterion)

# -----------------------------------------------------------------------------
