# -*- encoding: utf-8 -*-
import os
import re
import urlparse
from datetime import date, datetime, timedelta
from collections import defaultdict
from optparse import OptionParser

from cachetools.func import lru_cache
import yt.wrapper as yt

# Query-string parameter names collected by utm_filter.
UTM_PARAMS = [
    'utm_source',
    'utm_campaign',
    'utm_term',
    'utm_medium',
]

# Values of `canonized_vhost` that utm_filter accepts.
AVIA_HOSTS = [
    'avia.yandex.ru',
    'avia.yandex.ua',
    'bilet.yandex.com.tr',
]

# Accepted `status` values, kept as strings: 200 plus 300..308 inclusive.
VALID_CODES = ['200'] + list(map(str, range(300, 309)))

# utm_source must start with one of these prefixes; this stays a tuple
# because it is passed directly to str.startswith().
VALID_SOURCES_STARTSWITH = ('adwords', 'google', 'ohm', 'serp', 'travelaudience', 'ya')

# Whole-string validators applied in main():
#   REGEXP        - utm_campaign / utm_medium (letters, digits, underscore)
#   REGEXP_SOURCE - utm_source (letters, underscore)
#   REGEXP_UID    - yandexuid (digits only)
REGEXP_UID = re.compile(r'^[0-9]+$')
REGEXP_SOURCE = re.compile(r'^[a-zA-Z\_]+$')
REGEXP = re.compile(r'^[a-zA-Z0-9\_]+$')

# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_JUNK_COUNT = 3


def extract_yandex_uid(record):
    """Return the value of the ``yandexuid`` cookie stored in a log record.

    :param record: mapping whose ``cookies`` entry holds the raw cookie
        string: ``name=value`` pairs separated by ``;``.
    :return: the cookie value as a string, or ``None`` when the record has
        no cookies or no ``yandexuid`` cookie among them.
    """
    cookies = record.get('cookies')
    if not cookies:
        return None

    for cookie in cookies.split(';'):
        # Split on the first '=' only: cookie values may contain '=' (for
        # example base64 padding) and must not abort the search.
        cookie_name, separator, cookie_val = cookie.strip().partition('=')

        if not separator:
            # Malformed pair without '='; keep scanning the other cookies.
            continue

        if cookie_name == 'yandexuid':
            return cookie_val

    return None


def utm_filter(key, records):
    """Yield one UTM visit row per qualifying access-log record.

    The ``(key, records)`` signature matches a reducer-style callable; the
    caller is not visible in this file.

    A record qualifies when the group's ``canonized_vhost`` is one of
    ``AVIA_HOSTS``, its query string carries at least one of ``UTM_PARAMS``,
    it has a ``yandexuid`` cookie, an event time, and a ``status`` listed
    in ``VALID_CODES``.

    :param key: mapping with the group key; only ``canonized_vhost`` is read.
    :param records: iterable of mappings with ``http_params_rewritten``,
        ``cookies``, ``status`` and ``iso_eventtime`` entries.
    :return: generator of dicts with ``datetime``, ``date``, ``yandexuid``,
        ``status`` and every name from ``UTM_PARAMS`` (``''`` when absent).
    """
    if key.get('canonized_vhost') not in AVIA_HOSTS:
        return

    for r in records:
        try:
            http_params = urlparse.parse_qs(
                r.get('http_params_rewritten'), keep_blank_values=True
            )

        except AttributeError:
            # parse_qs got a non-string (no query string in this record).
            # Skip just this record; the rest of the group is still valid.
            continue

        utms = {}

        for p_name, p_values in http_params.items():
            if p_name in UTM_PARAMS:
                # sorted() keeps the joined value deterministic when a
                # parameter is repeated with different values.
                utms[p_name] = ','.join(sorted(set(p_values)))

        if not utms:
            continue

        status = r.get('status')
        if status not in VALID_CODES:
            continue

        eventtime = r.get('iso_eventtime')
        if not eventtime or not r.get('cookies'):
            # Without a timestamp or cookies there is nothing to attribute.
            continue

        yandexuid = extract_yandex_uid(r)
        if not yandexuid:
            continue

        record = {
            'datetime': eventtime,
            'date': eventtime[:10],
            'yandexuid': yandexuid,
            'status': status,
        }

        # Every UTM column is always present; missing or blank ones are ''.
        for param in UTM_PARAMS:
            record[param] = utms.get(param) or ''

        yield record


def _is_valid_incoming(utm_source, utm_campaign, utm_medium, yandexuid):
    """Tell whether the UTM marks and the uid of a visit are well-formed."""
    return bool(
        REGEXP_SOURCE.match(utm_source)
        and REGEXP.match(utm_campaign)
        and REGEXP.match(utm_medium)
        and REGEXP_UID.match(yandexuid)
        and utm_source.startswith(VALID_SOURCES_STARTSWITH)
    )


def _count_visits(datetimes, min_gap_seconds=5 * 60):
    """Count distinct visits among the event times of one user/UTM group.

    An event closer than ``min_gap_seconds`` to the previous event (whether
    that one was counted or not) is treated as a repeat of the same visit.

    :param datetimes: iterable of ``'%Y-%m-%d %H:%M:%S'`` strings.
    :return: ``(visits, repeats)`` tuple of ints.
    """
    visits = 0
    repeats = 0
    previous = None

    # Sort first: the gap check only makes sense in chronological order and
    # nothing in this script guarantees the table rows arrive sorted.
    eventtimes = sorted(
        datetime.strptime(value, '%Y-%m-%d %H:%M:%S') for value in datetimes
    )

    for eventtime in eventtimes:
        if previous is not None and (eventtime - previous).total_seconds() < min_gap_seconds:
            repeats += 1
        else:
            visits += 1

        previous = eventtime

    return visits, repeats


def main():
    """Load daily UTM visits and redirects from YT tables into the database.

    Command-line options: ``-d/--days`` (how many days to process, default 1)
    and ``-k`` (how many most recent days to skip, default 0).

    For every requested day, oldest first:

    * the day is skipped when its UTM log table does not exist, or when both
      ``UTM_incoming`` and ``UTM_redirect`` already have rows for it;
    * otherwise, inside one transaction, both models are cleared for that
      day and refilled from the UTM log and the redirect log.

    Finally every ``UTM`` that has no ``UTM_incoming`` rows is deleted
    together with its ``UTM_redirect`` rows.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'travel.avia.stat_admin.local_settings')
    import django
    django.setup()

    import logging

    from travel.avia.stat_admin.lib.yt_helpers import configure_wrapper

    from django.conf import settings
    from django.db import transaction

    from travel.avia.stat_admin.data.models import UTM, UTM_incoming, UTM_redirect

    configure_wrapper(yt)
    yt.config['pickling']['module_filter'] = lambda module: hasattr(module, '__file__') and not module.__file__.endswith('.so')
    yt.config['pickling']['force_using_py_instead_of_pyc'] = True

    log = logging.getLogger(__name__)

    @lru_cache(maxsize=None)
    def get_utm(utm_source, utm_campaign, utm_medium):
        """Fetch (creating if needed) the UTM row for the given marks."""
        utm_db, _created = UTM.objects.get_or_create(
            source=utm_source,
            campaign=utm_campaign,
            medium=utm_medium,
        )

        return utm_db

    def get_last_incoming(yandexuid):
        """Return the user's most recent UTM_incoming row, or None."""
        # NOTE(review): the lookup is not bounded by the processed day, so
        # when an older day is re-processed a later visit gives a negative
        # cohort and the redirect is skipped - confirm this is intended.
        return UTM_incoming.objects.filter(
            yandexuid=yandexuid
        ).order_by('-eventdate').first()

    optparser = OptionParser()
    optparser.add_option("-d", "--days", dest="days", type="int", help="number of days", default=1)
    optparser.add_option("-k", dest="days_skip", type="int", help="skip number of days", default=0)
    options, args = optparser.parse_args()

    today = date.today()

    for offset in reversed(range(options.days)):
        day = today - timedelta(days=offset + 1 + options.days_skip)
        day_str = day.strftime('%Y-%m-%d')
        destination_log = '//home/rasp/logs/rasp-utm-log/' + day_str
        redir_log = '//home/rasp/logs/rasp-popular-flights-log/' + day_str

        log.info('Process %s', day_str)

        if not yt.exists(destination_log):
            log.info('Not found %s', destination_log)
            continue

        already_loaded = (
            UTM_incoming.objects.filter(eventdate=day).exists()
            and UTM_redirect.objects.filter(eventdate=day).exists()
        )
        if already_loaded:
            continue

        with transaction.atomic():
            UTM_incoming.objects.filter(eventdate=day).delete()

            log.info('Read %s', destination_log)

            # Group by a tuple rather than a '_'-joined string: the values
            # themselves contain underscores, so joined keys could collide.
            datetimes_by_group = defaultdict(list)

            for record in yt.read_table(destination_log, format=yt.JsonFormat(), raw=False):
                group_key = (
                    record['yandexuid'].strip(),
                    record['utm_source'].strip(),
                    record['utm_campaign'].strip(),
                    record['utm_medium'].strip(),
                )
                datetimes_by_group[group_key].append(record['datetime'])

            incoming_objects = []
            skip_count = 0

            for group_key, datetimes in datetimes_by_group.items():
                yandexuid, utm_source, utm_campaign, utm_medium = group_key

                visits, repeats = _count_visits(datetimes)
                skip_count += repeats

                if not _is_valid_incoming(utm_source, utm_campaign, utm_medium, yandexuid):
                    continue

                try:
                    utm_db = get_utm(utm_source, utm_campaign, utm_medium)

                except Exception:
                    # Best effort: drop this group but keep the traceback.
                    log.info("%s %s %s", utm_source, utm_campaign, utm_medium, exc_info=True)
                    continue

                # One row per distinct visit of this user with these marks.
                incoming_objects.extend(
                    UTM_incoming(eventdate=day, utm=utm_db, yandexuid=yandexuid)
                    for _ in range(visits)
                )

            log.info('Store UTM_incoming: %s (%s skipped)', len(incoming_objects), skip_count)
            UTM_incoming.objects.bulk_create(incoming_objects, batch_size=1000)

            UTM_redirect.objects.filter(eventdate=day).delete()

            log.info('Read %s', redir_log)
            redirect_objects = []

            for record in yt.read_table(redir_log, format=yt.JsonFormat(), raw=False):
                yandexuid = (record.get('yandexuid') or '').strip()
                if not yandexuid:
                    continue

                last_incoming = get_last_incoming(yandexuid)
                if last_incoming is None:
                    continue

                # Whole weeks between the last UTM visit and this redirect;
                # floor division keeps "visit after redirect" negative.
                cohort = (day - last_incoming.eventdate).days // 7

                if cohort < 0 or cohort > 7:
                    continue

                redirect_objects.append(UTM_redirect(
                    eventdate=day,
                    utm=last_incoming.utm,
                    yandexuid=yandexuid,
                    cohort=cohort,
                    price=settings.AVIA_CLICK_PRICE
                ))

            log.info('Store UTM_redirect: %s', len(redirect_objects))
            UTM_redirect.objects.bulk_create(redirect_objects, batch_size=1000)

    # Delete UTMs that have no incoming visits left.
    for utm in UTM.objects.all():
        if not UTM_incoming.objects.filter(utm=utm).exists():
            # The original statement built this queryset without running it;
            # delete the dependent redirects explicitly before the UTM.
            UTM_redirect.objects.filter(utm=utm).delete()
            utm.delete()


# Run the loader only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
