#!/usr/bin/env python
#
# Script to do bulk querying by reading every row. This is useful in
# situations where the conditions are rare in a very large relation.
# When doing a row at a time, the selected columns are returned as a
# dict which can be fed into the write query. The write query needs to
# follow postgresql parameter naming convention, namely '%(column)s'
# is used. When using a LIKE clause with a '%', you must duplicate the
# percent as '%%'. The column must appear in the read query. It is
# important to remember that an 'id' column must be available from the
# read query and use AS if necessary. When doing --batch-ids, the
# parameters are not fed back into the write query since we are
# aggregating ids.
#
# Examples:
#
#  For every user move their user_properties 'birthday' into users.
#  for-every-row --no-op -v -b 1000 -d justintv_dev --read-query="select users.id as id, up.val as birthday from users join user_properties up on users.id = up.user_id and up.name='birthday' where users.birthday is null" --write-query="update users set birthday = %(birthday)s where id = %(id)s" users
#
#  Do a batch process to delete user_email_properties which do not join to users.
#  for-every-row --no-op -B -v -d justintv_dev --read-query "SELECT user_email_properties.id AS id FROM user_email_properties LEFT JOIN users u ON user_email_properties.user_id = u.id WHERE u.id IS NULL" --write-query "DELETE FROM user_email_properties WHERE id = any( %(ids)s )" user_email_properties
#
#  Do a batch process to delete short banned words or words that contain ***.
#  Smaller batch size chosen for safety around the extra work of LIKE. Note the double %.
#  for-every-row --no-op -B -v --batch-size 100 -d chat_depot --read-query "SELECT id FROM banned_words WHERE (octet_length(word) <= 2 OR word LIKE '%%***%%')" --write-query "DELETE FROM banned_words WHERE id = any( %(ids)s )" banned_words

def parse_args():
    """Parse the command line for this script.

    Returns an argparse.Namespace carrying the connection settings
    (host/port/user/pass/db), the read and write queries, and the iteration
    controls (start id, batch size, sleep, max writes, dry-run flag).
    """
    import argparse
    parser = argparse.ArgumentParser(description="Process a very large relation by reading all rows.")
    parser.add_argument('relation', help="The primary relation that we will be reading.")
    parser.add_argument('--read-query', '-r', default=None,
                        help=("A generic read query will be generated for you if one is not provided here. "
                              "You can specify a custom query here. Be sure to read 'id' since that will be used "
                              "for iteration. A WHERE clause is assumed to be in the supplied query. The query "
                              "is amended by appending an AND clause with the id range so be sure to put "
                              "parentheses around statements with an OR condition."))
    parser.add_argument('--write-query', '-w', default=None,
                        help=("A query to run with the row column values found in the read-query. If not provided, "
                              "the write-query will be read from stdin."))
    # type=int rather than the Python-2-only 'long': on Python 2, int()
    # auto-promotes large values to long, so behavior is unchanged there,
    # and Python 3 has no 'long' at all.
    parser.add_argument('--start', '-S', type=int, default=0, help="Set the starting id for reading the relation.")
    parser.add_argument('--batch-size', '-b', type=int, default=1000,
                        help="Set batch size when iterating over the relation.")
    parser.add_argument('--batch-ids', '-B', action='store_true', default=False,
                        help="Collect the ids and pass to write-query as an array of 'ids'.")
    parser.add_argument('--no-op', dest='do_it', action='store_false', default=True,
                        help="Do not actually perform the write operation.")
    parser.add_argument('--sleep', '-s', type=float, default=0.0, help="How long to sleep between loops.")
    # Note the trailing space inside the first fragment: without it the help
    # text rendered as "numberof rows".
    parser.add_argument('--max-writes', '-m', dest='max_writes', type=int, default=None, help=("The maximum number "
                        "of rows to write before exiting (modulo batch size). Proceed to completion if not specified."))
    parser.add_argument('--id-column', '-i', default='id')
    parser.add_argument('--host', '-H', default='/var/run/postgresql')
    parser.add_argument('--port', '-P', type=int, default=5432)
    parser.add_argument('--user', '-u', default='postgres')
    parser.add_argument('--pass', '-p', dest='passwd', default='')
    parser.add_argument('--db', '-d', default='')
    return parser.parse_args()


def _find_range(cc, relation, id):
    cc.execute("select min({0}), max({0}) from {1}".format(id, relation))
    (min, max) = cc.fetchall()[0]
    return long(min), long(max)


def _build_read_query(read_query, relation, id):
    if read_query is None:
        return ("select {id} from {relation} where {id} >= %(start)s and {id} < %(end)s "
                "/* for-every-row default read */").format(**locals())
    suffix = "and ( {0}.{1} >= %(start)s and {0}.{1} < %(end)s ) /* for-every-row reader */".format(relation, id)
    return ' '.join([read_query, suffix])


def _is_under_write_limit(rows_processed, max_writes):
    """
    Return whether or not the given parameters semantically represent a state
    where we haven't reached the specified limit of writes. None for max_writes
    always returns true
    """
    return max_writes is None or rows_processed < max_writes


def _process_batch_rows(rc, wc, query, do_it, verbose):
    rows = rc.fetchall()
    ids = {'ids': [long(row[0]) for row in rows]}
    rows_processed = 0
    if verbose:
        print("execute: {0}".format(ids))
    if do_it:
        wc.execute(query, ids)
        rows_processed += wc.rowcount
    return rows_processed


def _process_every_row(rc, wc, query, do_it, verbose):
    # the row here is a dictcursor
    row = rc.fetchone()
    rows_processed = 0
    while row:
        if verbose:
            print("execute: {0}".format(row))
        if do_it:
            wc.execute(query, row)
            rows_processed += wc.rowcount
        row = rc.fetchone()
    return rows_processed


def main():
    """Program entry point: walk the relation in id windows and apply writes.

    Connects with autocommit on (each write lands immediately), finds the id
    range of the relation, then iterates it in --batch-size windows. Each
    window's rows are fed to the write query either one row at a time (as
    named parameters from a RealDict cursor) or aggregated into an 'ids'
    array when --batch-ids is set.
    """
    args = parse_args()
    import psycopg2
    import time
    db = psycopg2.connect(host=args.host, port=args.port, user=args.user, password=args.passwd, database=args.db)
    db.autocommit = True
    wc = db.cursor()
    first, last = _find_range(wc, args.relation, args.id_column)
    if not args.batch_ids:
        # Keep the default cursor in the case of batching. Use cursor_factory once psycopg 2.5 is available.
        # Reconnecting via the dsn is how we swap in dict-shaped rows here.
        from psycopg2.extras import RealDictConnection
        db = RealDictConnection(db.dsn)
        db.autocommit = True
        wc = db.cursor()
    rc = db.cursor()
    start = max(first, args.start)
    read_query = _build_read_query(args.read_query, args.relation, args.id_column)
    if args.write_query is None:
        import sys
        write_query = sys.stdin.read().strip()
    else:
        write_query = args.write_query + " /* for-every-row writer */"
    if args.verbose:
        print(read_query)
        print(write_query)
        # "as many as": the id range may have gaps, so last - start is an
        # upper bound, not an exact row count. (Was "as many {0} rows".)
        print("Processing as many as {0} rows (until id={1}) in {2}.".format(last - start, last, args.relation))
    rows_processed = 0
    while start <= last and _is_under_write_limit(rows_processed, args.max_writes):
        end = start + args.batch_size
        rc.execute(read_query, {'start': start, 'end': end})
        if rc.rowcount > 0:
            if args.batch_ids:
                rows_processed += _process_batch_rows(rc, wc, write_query, args.do_it, args.verbose)
            else:
                rows_processed += _process_every_row(rc, wc, write_query, args.do_it, args.verbose)
        if args.verbose:
            print("next id: {0}".format(end))
        time.sleep(args.sleep)
        start = end

# Standard script guard; the body was indented 3 spaces, normalized to 4
# to match the rest of the file (PEP 8).
if __name__ == '__main__':
    main()
