import sys
import datetime
import dateutil
import dateutil.parser as dateutil_parser
import logging
import time
import sqlalchemy as sa
import argparse
import csv

from urlparse import urlparse
from boto.s3.connection import S3Connection
from StringIO import StringIO
from gzip import GzipFile

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Float, Integer, BigInteger, Text, DateTime
from sqlalchemy.orm import sessionmaker
from sqlalchemy.schema import *

def bail(fmt, *args):
    """Log an error and terminate the process.

    ``fmt`` and ``args`` are passed unchanged to ``logging.error`` (lazy
    %-style formatting).  The function never returns: it raises
    ``SystemExit`` carrying the message "aborting", exactly as
    ``sys.exit("aborting")`` would.
    """
    logging.error(fmt, *args)
    raise SystemExit("aborting")


# Declarative base; its metadata also holds the vba_score table defined in
# this module, so a single create_all() call creates both tables.
Base = declarative_base()
class Version(Base):
    """A row of the ``versions`` table: one imported data file.

    ``import_scores`` adds one ``Version`` per file it imports, and
    ``get_version_info`` reads the row with the highest ``id`` to decide
    which version number comes next.
    """
    __tablename__ = 'versions'
    # Version number (primary key); import_scores assigns consecutive values.
    id = Column(Integer, primary_key=True)
    # Timestamp supplied by fetch_data for the imported file.
    time = Column(DateTime)

# Score rows written by add_scores.  Declared as a plain Table on
# Base.metadata (not an ORM class); add_scores inserts into it via
# vba_score.insert().
vba_score = Table('vba_score', Base.metadata,
    # Set by add_scores to the version number the row was imported under.
    Column('version', Integer),
    # The remaining columns presumably map 1:1 onto CSV columns of the same
    # name (add_scores passes the CSV row dict through) -- TODO confirm
    # against the input format.
    Column('count', BigInteger),
    Column('category', Text),
    Column('dscore', BigInteger),
    Column('flock_score', BigInteger),
    Column('dscore_str', Text),
    Column('client_asn_id', BigInteger),
    Column('time', Float))

# Unique index named "vba_score_index": at most one row per
# (client_asn_id, version) pair.
Index(vba_score.name + '_index',
    vba_score.c.client_asn_id,
    vba_score.c.version,
    unique=True)

def get_db_session(database_url):
    """Return a new SQLAlchemy session bound to ``database_url``.

    An engine is created for the URL and ``create_all`` is run against it
    for everything registered on ``Base.metadata`` (the ``versions`` and
    ``vba_score`` tables and the score index) before the session is built.
    """
    engine = sa.create_engine(database_url)
    Base.metadata.create_all(engine)

    session_factory = sessionmaker(bind=engine)
    return session_factory()

def add_scores(session, new_version, data):
    """Insert every row of ``data`` into ``vba_score`` under ``new_version``.

    Each row dict is modified in place: a "version" entry is set to
    ``new_version`` and the "client_asn" entry is removed (a ``KeyError`` is
    raised if it is missing).  Nothing is committed here; that is left to
    the caller.
    """
    if not len(data):
        return

    for record in data:
        record.update(version=new_version)
        #record["time"] = datetime.datetime.utcfromtimestamp(float(record["time"]))
        record.pop("client_asn")

        # One INSERT per row for now.  A single multi-row insert failed on gvc
        # staging with:
        #   CompileError: The 'sqlite' dialect with current database version
        #   settings does not support in-place multirow inserts.
        # ... and even locally it eventually chokes when one insert carries
        # too many values.
        statement = vba_score.insert().values(record)
        session.execute(statement)


def get_version_info(session):
    """Return ``(new_version, latest_timestamp)`` for the next import.

    The ``versions`` row with the highest id is looked up.  With no rows the
    result is ``(0, None)``.  Otherwise ``new_version`` is that id plus one
    (0 if the id is None) and ``latest_timestamp`` is the row's ``time`` with
    a UTC tzinfo attached.
    """
    newest = (session.query(Version)
              .order_by(sa.desc(Version.id.name))
              .limit(1)
              .one_or_none())
    if newest is None:
        return 0, None

    latest_version = newest.id
    # The stored value is tagged as UTC so it can be compared with the
    # timestamps get_from_s3 parses from S3.
    latest_timestamp = newest.time.replace(tzinfo=dateutil.tz.gettz('UTC'))

    if latest_version is None:
        new_version = 0
    else:
        new_version = latest_version + 1
    return new_version, latest_timestamp

def get_from_s3(bucket, path_prefix, latest_timestamp=None):
    """Yield ``(key_name, file_object, last_modified)`` for matching S3 keys.

    Keys in ``bucket`` whose name starts with ``path_prefix`` (one leading
    "/" is stripped) are listed.  When ``latest_timestamp`` is given, only
    keys modified strictly after it are kept.  Keys are yielded oldest first;
    each one is downloaded into memory just before it is yielded, and keys
    whose name ends in ".gz" are wrapped in a ``GzipFile`` for reading.
    """
    connection = S3Connection(host="s3.us-west-2.amazonaws.com")
    # validate=False skips the bucket-existence check, which would need more
    # S3 permissions than are strictly necessary for us here.
    s3_bucket = connection.get_bucket(bucket, validate=False)

    if path_prefix.startswith('/'):
        path_prefix = path_prefix[1:]

    dated_keys = []
    for key in s3_bucket.list(prefix=path_prefix):
        dated_keys.append((key, dateutil_parser.parse(key.last_modified)))
    if latest_timestamp is not None:
        dated_keys = [pair for pair in dated_keys
                      if pair[1] > latest_timestamp]
    dated_keys.sort(key=lambda pair: pair[1])

    for key, last_modified in dated_keys:
        buf = StringIO()
        key.get_file(fp=buf)
        buf.seek(0)
        if key.name.endswith('.gz'):
            stream = GzipFile(fileobj=buf, mode="r")
        else:
            stream = buf
        yield key.name, stream, last_modified

def fetch_data(latest_timestamp, input_file, input_s3):
    """Yield ``(file_name, rows, timestamp)`` for every input to import.

    Exactly one of ``input_file`` and ``input_s3`` must be given; otherwise
    the process is aborted through ``bail``.  This is a generator, so that
    check (and any file access) only happens once iteration starts.

    Args:
        latest_timestamp: Timestamp of the most recently imported version, or
            None.  Only used for S3 input, where it is passed to
            ``get_from_s3`` so that keys not modified after it are skipped.
        input_file: Path of a local CSV file, or None.
        input_s3: URL whose hostname is the bucket and whose path is the key
            prefix, or None.

    Yields:
        Tuples ``(file_name, rows, timestamp)``.  ``rows`` is the list of
        dicts produced by ``csv.DictReader`` for the file.  ``timestamp`` is
        the current UTC time (naive) for a local file, or the value yielded
        by ``get_from_s3`` for an S3 key.
    """
    if (input_file is not None) == (input_s3 is not None):
        bail("Must give exactly one of --input-file or --input-s3")

    if input_file is not None:
        logging.info("Reading data from %s", input_file)
        new_timestamp = datetime.datetime.utcnow()
        data_file = open(input_file)
        # This has to be a list holding ONE (name, file, timestamp) tuple so
        # it has the same shape as what get_from_s3 yields.  A flat
        # three-item list would make the loop below try to unpack the path
        # string itself into three names.
        versions = [(input_file, data_file, new_timestamp)]
    else:
        url = urlparse(input_s3)
        logging.info("Searching for data at s3://%s%s*", url.hostname, url.path)
        versions = get_from_s3(
            bucket=url.hostname,
            path_prefix=url.path,
            latest_timestamp=latest_timestamp)

    for file_name, data_file, new_timestamp in versions:
        try:
            # Read everything up front so the file can be closed before the
            # rows are handed to the caller.
            data = list(csv.DictReader(data_file))
        finally:
            data_file.close()
        yield file_name, data, new_timestamp

def parse_args():
    """Parse the command line and return the argparse namespace.

    Options: ``--input-file``/``-f``, ``--input-s3``/``-s`` and
    ``--database-url``/``-d`` (default ``sqlite:///vba_score.db``).
    """
    # (flags, keyword arguments) for each option, in registration order.
    option_specs = (
        (('--input-file', '-f'),
         {'help': 'Read data from this CSV file'}),
        (('--input-s3', '-s'),
         {'help': 'Read data from the newest file in the given bucket with the given prefix'}),
        (('--database-url', '-d'),
         {'default': 'sqlite:///vba_score.db',
          'help': 'Database in which to store the data'}),
    )

    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()

def import_scores():
    """Import every pending input as a new version.

    Parses the command line, opens the database session, then for each
    ``(file_name, rows, timestamp)`` produced by ``fetch_data`` adds a
    ``Version`` row, inserts the score rows and commits.  Version numbers
    start at the value reported by ``get_version_info`` and increase by one
    per imported file.
    """
    args = parse_args()
    session = get_db_session(args.database_url)

    first_version, latest_timestamp = get_version_info(session)
    batches = fetch_data(latest_timestamp, args.input_file, args.input_s3)

    for version_id, (file_name, rows, timestamp) in enumerate(batches, first_version):
        print("Adding version {} - {}".format(version_id, file_name))
        session.add(Version(id=version_id, time=timestamp))
        add_scores(session, version_id, rows)
        # One commit per file, covering the Version row and its scores.
        session.commit()

if __name__ == '__main__':
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO)
    # Render log timestamps with time.gmtime instead of local time.
    logging.Formatter.converter = time.gmtime
    # Uncomment to raise the SQLAlchemy engine logger to INFO:
    # logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logging.info("starting asnrep score db import")
    try:
        import_scores()
        logging.info("success")
    except Exception:
        # Log the traceback, then signal failure through the exit status.
        logging.exception("unhandled exception")
        sys.exit(-1)
    finally:
        logging.shutdown()
