# -*- encoding: utf-8 -*-
import Cookie
import os
import re
import urlparse
from datetime import datetime, timedelta
from optparse import OptionParser
from pytz import timezone

import yt.wrapper as yt

# Query-string parameter names that utm_filter copies into its output rows.
UTM_PARAMS = [
    'utm_source',
    'utm_campaign',
    'utm_term',
    'utm_medium',
    'utm_content',
]

# Values of the 'vhost' field that utm_filter accepts; other rows are skipped.
AVIA_HOSTS = [
    'avia.yandex.ru',
    'avia.yandex.ua',
    'bilet.yandex.com.tr',
    'avia.yandex.kz',
    'flights.yandex.com',
]

# NOTE(review): not referenced anywhere in the visible part of this file.
ACCESS_LOG_SORTED_BY_FIELDS = [
    'canonized_vhost',
    'raw_yandexuid',
    'stbx_ip',
    'iso_eventtime',
]

# Accepted values of the 'status' field, as strings: 200 and 300..308.
VALID_CODES = [str(code) for code in [200] + list(range(300, 309))]

# A raw_yandexuid is trusted only if it is 1-32 lowercase letters/digits.
REGEXP_UID = re.compile('^[0-9a-z]{1,32}$')


# main() does nothing unless the environment type is one of these.
ALLOWED_ENVS = ['production', 'development']


def extract_yandex_uid(record):
    """Return the yandexuid of an access-log record, or None.

    The record's 'raw_yandexuid' field is returned when it is non-empty
    and matches REGEXP_UID. Otherwise the record's 'cookies' field is
    parsed and the value of its 'yandexuid' cookie is returned as is
    (the cookie value is not checked against REGEXP_UID).

    None is returned when there is no such cookie or when parsing the
    cookie string raises Cookie.CookieError.
    """
    uid = record.get('raw_yandexuid')
    if uid and REGEXP_UID.match(uid):
        return uid

    # Fall back to the cookie string.
    try:
        jar = Cookie.BaseCookie(record.get('cookies', ''))
    except Cookie.CookieError:
        return None

    morsel = jar.get('yandexuid')
    if morsel:
        return morsel.value
    return None


@yt.aggregator
def utm_filter(records):
    """Yield one output row per access-log row that carries UTM tags.

    A row is emitted only when all of the following hold:
      * its 'vhost' is listed in AVIA_HOSTS;
      * its query string ('args', falling back to 'http_params_rewritten')
        contains at least one parameter from UTM_PARAMS;
      * extract_yandex_uid() finds a yandexuid for it;
      * its 'status' is listed in VALID_CODES.

    Each emitted dict has the keys 'datetime' (the row's 'iso_eventtime'),
    'date' (the first 10 characters of 'iso_eventtime'), 'yandexuid',
    'status' and every name in UTM_PARAMS. UTM parameters missing from the
    query string are present with an empty-string value; repeated values
    are de-duplicated, sorted and joined with ','.

    :param records: iterable of dict-like access-log rows.
    """
    for r in records:
        if r.get('vhost') not in AVIA_HOSTS:
            continue

        query_string = r.get('args') or r.get('http_params_rewritten')

        try:
            http_params = urlparse.parse_qs(query_string, keep_blank_values=True)
        except AttributeError:
            # The row has no usable query string (e.g. it is None).
            # Skip just this row: returning here would end the generator
            # and silently drop every remaining row of the batch.
            continue

        # Sort the de-duplicated values so the joined string does not
        # depend on set iteration order.
        utms = {
            name: ','.join(sorted(set(values)))
            for name, values in http_params.items()
            if name in UTM_PARAMS
        }

        yandexuid = extract_yandex_uid(r)
        status = r.get('status')

        if utms and yandexuid and status in VALID_CODES:
            # Every output row carries all UTM columns; absent ones are ''.
            for param in UTM_PARAMS:
                utms.setdefault(param, '')

            record = {
                'datetime': r.get('iso_eventtime'),
                'date': r.get('iso_eventtime')[:10],
                'yandexuid': yandexuid,
                'status': status,
            }

            record.update(utms)

            yield record


def main():
    """Build the per-day UTM tables in YT from the avia access log.

    Sets up Django, and returns without doing anything when
    settings.YANDEX_ENVIRONMENT_TYPE is not in ALLOWED_ENVS. Otherwise,
    for each of the last N days (command-line option -d/--days, default 1,
    counted back from yesterday in Europe/Moscow time), runs utm_filter as
    a YT map over that day's access-log table and sorts the result.

    Each day is processed inside its own YT transaction. A failure for one
    day is logged and does not stop the remaining days.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'travel.avia.stat_admin.local_settings')
    import django
    django.setup()

    # These imports need Django to be configured first.
    import logging
    from django.conf import settings

    from travel.avia.stat_admin.lib.logs import create_current_file_run_log
    from travel.avia.stat_admin.lib.yt_helpers import configure_wrapper

    if settings.YANDEX_ENVIRONMENT_TYPE not in ALLOWED_ENVS:
        return

    configure_wrapper(yt)

    log = logging.getLogger(__name__)

    create_current_file_run_log()

    optparser = OptionParser()
    optparser.add_option("-d", "--days", dest="days", type="int", help="number of days", default=1)
    options, _ = optparser.parse_args()

    log.info("Start")
    log.info("days=%s", options.days)

    # Ask for the current time directly in the Moscow zone. Calling
    # localize() on a naive datetime.now() would only label the host's
    # local wall clock as Moscow time without converting it.
    # Computed once so a run that crosses midnight keeps a stable date range.
    today = datetime.now(timezone('Europe/Moscow')).date()

    for days_back in range(1, options.days + 1):
        day_str = (today - timedelta(days=days_back)).strftime('%Y-%m-%d')
        # We have our own access log.
        source_table = '//home/logfeller/logs/avia-access-log/1d/' + day_str
        destination_table = '//home/rasp/logs/rasp-utm-log/' + day_str
        destination_dir = os.path.dirname(destination_table)

        log.info("From %s -> %s", source_table, destination_table)

        # The try wraps the transaction so that an error propagates through
        # the context manager before being handled here. Catching it inside
        # the `with` block would let the transaction exit normally after a
        # half-finished day (e.g. mapped but not sorted).
        try:
            with yt.Transaction(timeout=60000 * 5):
                if not yt.exists(destination_dir):
                    log.info("Create %s", destination_dir)
                    yt.create('map_node', destination_dir, recursive=True)

                log.info("Run map %s -> %s", source_table, destination_table)
                yt.run_map(
                    source_table=source_table,
                    destination_table=destination_table,
                    binary=utm_filter,
                )

                log.info("Run sort %s", destination_table)
                yt.run_sort(
                    source_table=destination_table,
                    sort_by=['yandexuid', 'date', 'datetime'] + UTM_PARAMS
                )
        except Exception:
            # Best effort per day: record the traceback and move on.
            log.exception("Error while %s", day_str)

    log.info("Done")

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
