# -*- encoding: utf-8 -*-
import csv
import logging
import os
from collections import defaultdict
from datetime import datetime, timedelta
from optparse import OptionParser

# Use the stat_admin local settings unless DJANGO_SETTINGS_MODULE is already set
# in the environment, then initialise Django before the imports that follow.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'travel.avia.stat_admin.local_settings')
import django
django.setup()

from django.conf import settings

import yt.wrapper as yt

# Record fields whose values, in this order, form the grouping key of the report
# and the leading columns of the resulting CSV.
UTM_PARAMS = [
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_term',
    'utm_content',
]

# Only UTM log records whose utm_source is listed here are taken into account.
UTM_SOURCES = [
    'ohm_inventory',
]

# YT table path templates; {log_date} is substituted with the date of the log.
UTM_LOG_TEMPLATE = '//home/rasp/logs/rasp-utm-log/{log_date}'
REDIR_LOG_TEMPLATE = '//home/avia/logs/avia-redir-log/{log_date}'


# Module-level logger.
log = logging.getLogger(__name__)


def _read_first_visits(start, end):
    """Return the earliest tracked UTM visit of every yandexuid seen in [start, end].

    Reads one UTM log table per day, walking back from ``end`` to ``start``
    (both inclusive); days without a table are skipped.  Only records whose
    ``utm_source`` is in ``UTM_SOURCES`` and that carry a non-empty
    ``yandexuid`` are considered.

    :param start: first log date (``datetime.date``).
    :param end: last log date (``datetime.date``).
    :return: dict ``yandexuid -> (event_datetime, utm_key)`` where ``utm_key``
        is the tuple of ``UTM_PARAMS`` values of the earliest record.
    :raises KeyError, ValueError: if a considered record has no ``datetime``
        field or it is not formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    first_visits = {}

    for offset in range((end - start).days + 1):
        utm_log = UTM_LOG_TEMPLATE.format(log_date=end - timedelta(days=offset))
        # Single-argument form prints the same text under Python 2 and 3.
        print('%s Read %s' % (offset, utm_log))

        if not yt.exists(utm_log):
            continue

        for record in yt.read_table(utm_log, format=yt.JsonFormat(), raw=False):
            if record.get('utm_source') not in UTM_SOURCES:
                continue

            yandexuid = record.get('yandexuid')
            if not yandexuid:
                continue

            utm_key = tuple(record.get(param, '') for param in UTM_PARAMS)
            event_datetime = datetime.strptime(record['datetime'], "%Y-%m-%d %H:%M:%S")

            # Keep only the earliest visit of each user.
            known = first_visits.get(yandexuid)
            if known is None or event_datetime < known[0]:
                first_visits[yandexuid] = (event_datetime, utm_key)

    return first_visits


def _count_redirects(first_visits, end, cohorts_count):
    """Count redirects of tracked users per UTM key and weekly cohort.

    Reads ``cohorts_count * 7`` daily redirect log tables walking back from
    ``end``; days without a table are skipped.

    :param first_visits: mapping produced by ``_read_first_visits``.
    :param end: most recent log date to read (``datetime.date``).
    :param cohorts_count: number of weekly cohorts.
    :return: dict ``utm_key -> {cohort_num: redirect_count}`` with every
        cohort number in ``range(cohorts_count)`` present.
    """
    cohorts = {}

    for offset in range(cohorts_count * 7):
        redir_log = REDIR_LOG_TEMPLATE.format(log_date=end - timedelta(days=offset))
        print('Read %s' % redir_log)

        if not yt.exists(redir_log):
            continue

        for record in yt.read_table(redir_log, format=yt.JsonFormat(), raw=False):
            visit = first_visits.get(record.get('yandexuid'))
            if visit is None:
                # Untracked user: skip before paying for timestamp parsing.
                continue

            incoming_datetime, utm_key = visit
            redir_eventtime = datetime.strptime(record['iso_eventtime'], '%Y-%m-%d %H:%M:%S')

            # NOTE(review): the difference is "first visit minus redirect", so
            # only redirects made at or before the first visit get a
            # non-negative cohort number -- confirm this direction is intended.
            cohort_num = (incoming_datetime - redir_eventtime).days // 7

            # Valid cohorts are 0 .. cohorts_count - 1; anything else would be
            # a missing key in the per-key counters.
            if not 0 <= cohort_num < cohorts_count:
                continue

            if utm_key not in cohorts:
                cohorts[utm_key] = dict.fromkeys(range(cohorts_count), 0)

            cohorts[utm_key][cohort_num] += 1

    return cohorts


def _write_report(file_name, first_visits, cohorts, cohorts_count):
    """Write the cohort table to ``file_name`` as a ';'-separated CSV.

    Each row holds the UTM key values, the number of users whose first visit
    had that key, and the redirect count of every cohort.  Only keys that have
    at least one counted redirect are written.
    """
    cohorts_total = defaultdict(int)
    for _, utm_key in first_visits.values():
        cohorts_total[utm_key] += 1

    header = list(UTM_PARAMS) + ['incoming'] + list(range(cohorts_count))

    # The context manager guarantees the report is flushed and closed.
    with open(file_name, 'w') as ofile:
        owriter = csv.writer(ofile, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        owriter.writerow(header)

        for key in sorted(cohorts):
            counts = [cohorts[key][num] for num in range(cohorts_count)]
            owriter.writerow(list(key) + [cohorts_total[key]] + counts)


def main():
    """Build the UTM cohort report and store it under REPORTS_STORAGE_ROOT/cohorts.

    Command-line options:
      -s/--start    first date of incoming UTM logs, ``YYYY-MM-DD``
                    (default: 8 days ago);
      -e/--end      last date of incoming UTM logs, ``YYYY-MM-DD``
                    (default: yesterday);
      -c/--cohorts  number of weekly cohorts (default: 8).
    """
    from travel.avia.stat_admin.lib.yt_helpers import configure_wrapper

    configure_wrapper(yt)
    today = datetime.today().date()
    default_right_border = (today - timedelta(days=1))
    default_left_border = (default_right_border - timedelta(days=7))

    optparser = OptionParser()
    optparser.add_option("-s", "--start", dest="start", type="string", help="date start", default=default_left_border.strftime('%Y-%m-%d'))
    optparser.add_option("-e", "--end", dest="end", type="string", help="date end", default=default_right_border.strftime('%Y-%m-%d'))
    optparser.add_option("-c", "--cohorts", dest="cohorts", type="int", help="number of cohorts", default=8)
    options, _ = optparser.parse_args()

    options.start = datetime.strptime(options.start, '%Y-%m-%d').date()
    options.end = datetime.strptime(options.end, '%Y-%m-%d').date()

    # Incoming traffic: first tracked visit of every user.
    utms = _read_first_visits(options.start, options.end)

    # Outgoing traffic: redirects of those users, grouped into cohorts.
    cohorts = _count_redirects(utms, options.end, options.cohorts)

    dir_root = os.path.join(settings.REPORTS_STORAGE_ROOT, 'cohorts')

    if not os.path.exists(dir_root):
        os.makedirs(dir_root)

    file_name = '{dir_root}/cohorts_{date_from}-{date_to}.csv'.format(
        dir_root=dir_root,
        date_from=options.start.strftime('%Y-%m-%d'),
        date_to=options.end.strftime('%Y-%m-%d'),
    )

    _write_report(file_name, utms, cohorts, options.cohorts)


# Build the report only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
