# -*- coding: utf-8 -*-
import os

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'travel.avia.stat_admin.local_settings')
import django
django.setup()

import argparse
import Cookie
import gzip
import logging
from cStringIO import StringIO

import pytz
import urlparse
import requests

from cachetools.func import lru_cache
from datetime import datetime, timedelta
from itertools import groupby
from pytz import timezone

from travel.avia.stat_admin.lib.logs import add_stdout_handler, create_current_file_run_log
from travel.avia.stat_admin.lib.yt_helpers import configure_wrapper
from travel.avia.stat_admin.data.models import OhmUtm, OhmIncoming, Setting
from django.conf import settings
from django.db import IntegrityError
from django.db.transaction import atomic

import yt.wrapper as yt

log = logging.getLogger(__name__)


# Query-string parameters that extract_utms() keeps.
UTM_PARAMS = ('utm_source', 'utm_campaign', 'utm_term', 'utm_medium', 'utm_content')
# utm_source prefixes checked by is_important_utms() via str.startswith().
IMPORTANT_SOURCES_STARTSWITH = (
    'dist',
    'doubletrade',
    'facebook',
    'google',
    'homescreen',
    'ohm',
    'radio',
    'rasp',
    'serp',
    'sovetnik',
    'travelaudience',
    'viber',
    'wizard',
    'ya',
    'yama',
    'yandex',
)
# Value stored in the "version" field of OhmIncoming rows and YT records.
DATA_VERSION = 2
MOSCOW_TIMEZONE = timezone('Europe/Moscow')
# Key of the Setting row that do_import() updates after a run.
SETTING_KEY = 'UTM_V3_LAST_IMPORT_TIME'
SETTING_DEFAULTS = {
    'value': '',
}
# datetime.now(tz) converts the current instant into Moscow time.  The
# previous localize(datetime.now()) merely labelled the host's local wall
# clock as Moscow time, which is wrong on hosts running in another zone.
MSK_NOW_STR = datetime.now(
    MOSCOW_TIMEZONE
).strftime(
    '%Y-%m-%d %H:%M:%S'
)
MSK_YESTERDAY_STR = (
    datetime.now(MOSCOW_TIMEZONE) - timedelta(days=1)
).strftime(
    '%Y-%m-%d %H:%M:%S'
)
# Data centers in which do_import() is allowed to run.
ALLOWED_DC = ['sas']
TOPIC_LOG_TEMPLATE = '/tmp/lb_topics/{topic}-{date}.gz'
# 3 GiB: log_rows() writes only when more than this many bytes are free.
MIN_FREE_SPACE = (1024 ** 3) * 3
ENABLE_TOPIC_LOG = True
ENABLE_YT_DELIVERY = True
# YT directory that receives one table per event date.
YT_UTM_LOG_PATH = '//home/rasp/logs/rasp-utm-log-v2'
YT_UTM_LOG_SCHEMA = [
    {'name': 'yandexuid', 'type': 'string'},
    {'name': 'date', 'type': 'string'},
    {'name': 'datetime', 'type': 'string'},
    {'name': 'utm_source', 'type': 'string'},
    {'name': 'utm_campaign', 'type': 'string'},
    {'name': 'utm_term', 'type': 'string'},
    {'name': 'utm_medium', 'type': 'string'},
    {'name': 'utm_content', 'type': 'string'},
    {'name': 'status', 'type': 'int64'},
    {'name': 'version', 'type': 'int64'},
]

def _parse_host_info(lines):
    """Turn ``key=value`` lines into a dict, stripping single quotes from values.

    Lines without an ``=`` (blank or malformed) are skipped instead of
    raising ValueError while the module is being imported.
    """
    info = {}
    for row in lines:
        key, sep, value = row.strip().partition('=')
        if not sep:
            continue
        info[key] = value.strip("'")
    return info


# Host metadata: read from the conductor hostinfo file when it can be opened,
# otherwise only the data center is taken from the DEPLOY_NODE_DC variable.
try:
    with open('/etc/conductor-hostinfo/hostinfo', 'rb') as f:
        HOST_INFO = _parse_host_info(f.readlines())
except IOError:
    HOST_INFO = {
        'dc': os.getenv('DEPLOY_NODE_DC', '').lower(),
    }


class LogBrokerTopicReader(object):
    """Reads rows of one log type from LogBroker through its HTTP pull URLs.

    The usual entry point is new_rows(): for every topic returned by
    pull/list it looks up the current offset, asks pull/suggest for a
    host/topic pair, opens a session, reads gzip-compressed chunks and
    commits the last offset that was read.

    Failed HTTP requests and responses that cannot be parsed are reported
    as BadAPIResponse.
    """

    logbroker_root_template = 'http://{dc}.logbroker.yandex.net:8999'

    pull_list_template = '{lb_root}/pull/list?log-type={log_type}'
    pull_offsets_template = '{lb_root}/pull/offsets?client={client_id}&topic={topic}'
    pull_suggest_template = '{lb_root}/pull/suggest?client={client_id}&topic={topic}'

    pull_session_template = 'http://{hostname}/pull/session?client={client_id}&topic={topic}'
    pull_read_template = 'http://{hostname}/pull/read?client={client_id}&topic={topic}&format=raw&session={session}&limit={limit}'
    pull_commit_template = 'http://{hostname}/pull/commit?client={client_id}&topic={topic}&session={session}&offset={offset}'

    def __init__(self, dc, client_id, log_type, max_row_read_limit=500000):
        """Store connection parameters and reset the reading state.

        :param dc: data center name substituted into the LogBroker root URL.
        :param client_id: value of the ``client`` query parameter.
        :param log_type: value of the ``log-type`` parameter of pull/list.
        :param max_row_read_limit: new_rows() stops once at least this many
            rows have been yielded (checked after each read pass).
        """
        self.client_id = client_id
        self.current_dc = dc
        self.log_type = log_type
        # Value of the limit= parameter of pull/read.
        self.limit = 5000
        self.rows_read_count = 0
        self.max_row_read_limit = max_row_read_limit
        self.s = requests.session()
        self.session_id = None
        self.last_offset = None
        self.max_offset = None
        self.hostname = None
        self.topic = None

    def _http_get(self, url, use_session=False):
        """GET ``url`` and return the response.

        Transport errors are converted to BadAPIResponse here, so callers
        always have a bound response object when they start parsing it.
        """
        http = self.s if use_session else requests
        try:
            return http.get(url)
        except requests.exceptions.RequestException as e:
            raise BadAPIResponse(e)

    def parse_tksv_line(self, line):
        """Parse tab-separated ``key=value`` pairs into a dict.

        Fields without ``=`` are ignored; keys and values are stripped of
        surrounding whitespace. A later duplicate key overrides an earlier one.
        """
        record = {}
        for pair in line.strip().split('\t'):
            key, sep, value = pair.partition('=')
            if sep:
                record[key.strip()] = value.strip()

        return record

    def logbroker_root(self):
        """Return the LogBroker root URL for the configured data center."""
        return self.logbroker_root_template.format(dc=self.current_dc)

    def pull_list_url(self):
        """Return the pull/list URL for the configured log type."""
        return self.pull_list_template.format(
            lb_root=self.logbroker_root(),
            log_type=self.log_type
        )

    def pull_offsets_url(self, topic):
        """Return the pull/offsets URL for ``topic``."""
        return self.pull_offsets_template.format(
            lb_root=self.logbroker_root(), client_id=self.client_id, topic=topic
        )

    def pull_session_url(self, hostname, topic):
        """Return the pull/session URL for ``topic`` on ``hostname``."""
        return self.pull_session_template.format(
            hostname=hostname, client_id=self.client_id, topic=topic
        )

    def pull_read_url(self):
        """Return the pull/read URL for the current host, topic and session."""
        return self.pull_read_template.format(
            hostname=self.hostname,
            client_id=self.client_id,
            topic=self.topic,
            session=self.session_id,
            limit=self.limit,
        )

    def pull_suggest_url(self, topic):
        """Return the pull/suggest URL for ``topic``."""
        return self.pull_suggest_template.format(
            lb_root=self.logbroker_root(), client_id=self.client_id, topic=topic
        )

    def remember_current_offset(self, topic):
        """Fetch pull/offsets for ``topic`` and store last_offset / max_offset.

        The response is split on tabs; field 1 is used as the current offset
        and field 4 as the read lag, so max_offset = offset + lag.

        :raises BadAPIResponse: on a failed request or an unparsable response.
        """
        url = self.pull_offsets_url(topic)
        log.info('pull/offset: %s', url)
        r = self._http_get(url)
        try:
            fields = r.content.strip().split('\t')
            lag = int(fields[4])
            offset = int(fields[1])
        except Exception:
            raise BadAPIResponse(r.content)

        self.last_offset = offset
        self.max_offset = offset + lag
        log.info('topic: {topic}, read lag: {lag}'.format(topic=topic, lag=lag))

    def get_topics(self):
        """Return the topic names listed by pull/list, one per response line.

        Blank lines are dropped, so an empty response yields an empty list.

        :raises BadAPIResponse: on a failed request.
        """
        url = self.pull_list_url()
        log.info('pull/list: %s', url)
        r = self._http_get(url)

        stripped = (line.strip() for line in r.content.strip().split('\n'))
        return [line for line in stripped if line]

    def filter_by_dc_topic(self, topics):
        """Return the first topic whose second dot-separated part starts with
        the current data center name, or None when there is no such topic.

        Topic names without a dot are skipped.
        """
        for t in topics:
            parts = t.split('.')
            if len(parts) > 1 and parts[1].startswith(self.current_dc):
                return t

    def get_topic_info(self, topic):
        """Yield the tab-separated fields of every pull/offsets line for ``topic``.

        :raises BadAPIResponse: on a failed request or an unparsable response
            (raised on first iteration, since this is a generator).
        """
        # Was self.pull_offsets(topic), a method that does not exist.
        url = self.pull_offsets_url(topic)
        log.info('pull/offset: %s', url)
        r = self._http_get(url)
        try:
            lines = r.content.strip().split('\n')
        except Exception:
            raise BadAPIResponse(r.content)

        for line in lines:
            yield line.split('\t')

    def start_session(self):
        """Open a pull session and remember the id from the ``Session`` header.

        :raises BadAPIResponse: on a failed request or a missing header.
        """
        session_url = self.pull_session_url(self.hostname, self.topic)
        log.info('Create new session: %s', session_url)
        r = self._http_get(session_url, use_session=True)
        try:
            self.session_id = r.headers['Session']
        except Exception:
            raise BadAPIResponse(r.content)

        log.info('Use session ID: %s', self.session_id)

    def remember_free_partition(self, topic):
        """Ask pull/suggest for ``topic`` and store the returned host and topic.

        The response must consist of exactly two tab-separated fields.

        :raises BadAPIResponse: on a failed request or an unexpected response.
        """
        r = self._http_get(self.pull_suggest_url(topic))
        try:
            self.hostname, self.topic = r.content.strip().split('\t')
        except Exception:
            raise BadAPIResponse(r.content)

    def read(self):
        """Yield parsed rows from one pull/read response.

        The response is a sequence of chunks: a header line in key=value
        form followed by ``size`` bytes of gzip data. After each chunk has
        been fully consumed, last_offset is set to the chunk's ``offset``.

        :raises BadAPIResponse: on a failed request or a malformed response.
        """
        url = self.pull_read_url()
        log.info('\tGet DATA: %s', url)
        r = self._http_get(url, use_session=True)
        try:
            content = r.content
            while content:
                header, body = content.split('\n', 1)
                header = self.parse_tksv_line(header)

                # Only reported: decompression is still attempted below and
                # fails with BadAPIResponse if the data is not gzip.
                if header['codec'] != 'gzip':
                    log.error('Unsupported codec: {}'.format(header['codec']))

                size = int(header['size'])
                with gzip.GzipFile(fileobj=StringIO(body[:size]), mode='rb') as inp:
                    for _line in inp:
                        row = self.parse_tksv_line(_line)
                        yield row

                self.last_offset = int(header['offset'])

                content = body[size:]

        except Exception:
            raise BadAPIResponse(r.content)

    def commit_last_row(self):
        """Commit last_offset for the current session; no-op when it is falsy.

        :raises BadAPIResponse: on a failed request.
        """
        if self.last_offset:
            commit_url = self.pull_commit_template.format(
                hostname=self.hostname,
                client_id=self.client_id,
                topic=self.topic,
                session=self.session_id,
                offset=self.last_offset,
            )
            log.info('\tCommit: %s', commit_url)
            self._http_get(commit_url, use_session=True)

    def new_rows(self):
        """Yield rows from every topic of the log type until caught up.

        Topics whose setup (offset lookup, suggest, session) fails are logged
        and skipped. Reading a topic stops when last_offset reaches
        max_offset, when the total row limit is hit (this ends the whole
        generator), or when a read pass makes no progress at all.
        """
        for topic in self.get_topics():
            try:
                self.remember_current_offset(topic)
                self.remember_free_partition(topic)
                self.start_session()
            except BadAPIResponse as e:
                log.warning('Bad API responce: %s', e)
                continue
            except Exception:
                # Not a bare except: KeyboardInterrupt / SystemExit must
                # still be able to stop the script.
                log.exception("General except")
                continue

            while self.last_offset < self.max_offset:
                offset_before = self.last_offset
                rows_before = self.rows_read_count

                for row in self.read():
                    self.rows_read_count += 1
                    yield row

                self.commit_last_row()

                if self.rows_read_count >= self.max_row_read_limit:
                    log.info("Max row limit reached")
                    return

                # An empty response leaves the offset untouched; without this
                # check the loop would re-request the same URL forever.
                if (self.last_offset == offset_before
                        and self.rows_read_count == rows_before):
                    log.warning('No progress reading topic %s, skip it', topic)
                    break


class TopicNotFound(Exception):
    """Exception type for a topic that cannot be found.

    NOTE(review): nothing in the code visible here raises it.
    """


class BadAPIResponse(Exception):
    """Raised when a LogBroker HTTP request fails or its response is unusable."""


@lru_cache(maxsize=None)
def get_utm(source, campaign, medium, term):
    """Return the OhmUtm row for the given UTM values, creating it if needed.

    Results (including None) are memoized per argument tuple for the life of
    the process.

    :returns: the OhmUtm instance, or None when get_or_create raised
        IntegrityError (the values are logged as a warning).
    """
    lookup = dict(source=source, campaign=campaign, medium=medium, term=term)
    try:
        utm_row, _ = OhmUtm.objects.get_or_create(**lookup)
    except IntegrityError:
        log.warning(
            'source=%r, campaign=%r, medium=%r, term=%r',
            source, campaign, medium, term
        )
        return None
    else:
        return utm_row


def extract_utms(raw_http_args):
    """Pick the UTM parameters out of a raw query string.

    :param raw_http_args: query string such as ``utm_source=a&x=1``.
    :returns: dict mapping each parameter named in UTM_PARAMS that is present
        to its distinct values, sorted and joined with commas. Blank values
        are kept.
    """
    parsed = urlparse.parse_qs(raw_http_args, keep_blank_values=True)
    return {
        name: ','.join(sorted(set(values)))
        for name, values in parsed.items()
        if name in UTM_PARAMS
    }


def is_important_utms(utms):
    """Tell whether the record's utm_source starts with an important prefix.

    :param utms: dict of UTM values as produced by extract_utms().
    :returns: True when ``utm_source`` starts with one of
        IMPORTANT_SOURCES_STARTSWITH; False otherwise, including when
        ``utm_source`` is missing or empty (previously a missing key raised
        AttributeError on None).
    """
    source = utms.get('utm_source') or ''
    return source.startswith(IMPORTANT_SOURCES_STARTSWITH)


def extract_yandexuid(raw_cookies):
    """Return the value of the ``yandexuid`` cookie from a raw Cookie header.

    :returns: the cookie value, or None when the header is empty, cannot be
        parsed (CookieError) or carries no ``yandexuid`` cookie.
    """
    if not raw_cookies:
        return None

    try:
        jar = Cookie.BaseCookie(raw_cookies)
    except Cookie.CookieError:
        return None

    morsel = jar.get('yandexuid')
    if morsel:
        return morsel.value
    return None


def make_tksv_line(row):
    """Serialize a dict as one line: ``tskv`` then tab-separated key=value pairs.

    Values are formatted with ``%s`` as they are; no escaping is applied.
    """
    pairs = ('%s=%s' % item for item in row.iteritems())
    return 'tskv\t' + '\t'.join(pairs)


# TODO: revert this commit once work on the cohort API v3 is finished.
# Writing the stream to a file will no longer be needed, because this stream is stored in YT anyway.
def log_rows(log_type, rows):
    """Append rows to today's gzip topic log under /tmp (best effort).

    Nothing is written when topic logging is disabled or when the target
    file system has MIN_FREE_SPACE bytes or less available.

    :param log_type: topic name used in the file name.
    :param rows: iterable of dicts; each becomes one TSKV line.
    """
    # Fall back to the module-level default when settings lack the flag.
    if not getattr(settings, 'ENABLE_TOPIC_LOG', ENABLE_TOPIC_LOG):
        return

    date_str = MSK_NOW_STR[:10]
    file_name = TOPIC_LOG_TEMPLATE.format(topic=log_type, date=date_str)
    dir_name = os.path.dirname(file_name)

    if not os.path.isdir(dir_name):
        try:
            os.makedirs(dir_name)
        except OSError:
            # Another process may have created the directory in between.
            if not os.path.isdir(dir_name):
                raise

    # Disk space is scarce on the hosts, so check what is left before writing.
    s = os.statvfs(dir_name)
    free_space = s.f_bavail * s.f_frsize

    if free_space <= MIN_FREE_SPACE:
        log.warning(
            'Skip topic log %s: only %d bytes free', file_name, free_space
        )
        return

    with gzip.open(file_name, 'a') as f:
        lines = ['%s\n' % make_tksv_line(row) for row in rows]

        try:
            f.writelines(lines)
        except Exception:
            # Still best effort, but leave a trace instead of hiding it.
            log.exception('Failed to write topic log %s', file_name)


def utc_to_moscow(utc_dt):
    """Convert a datetime holding UTC wall-clock time to Moscow time.

    Any tzinfo already on ``utc_dt`` is replaced with UTC before converting.
    """
    aware_utc = utc_dt.replace(tzinfo=pytz.utc)
    moscow_dt = aware_utc.astimezone(MOSCOW_TIMEZONE)
    return MOSCOW_TIMEZONE.normalize(moscow_dt)


def _parse_event_time(raw_timestamp):
    """Parse ``%Y-%m-%dT%H:%M:%S`` and attach the Moscow zone; None if malformed."""
    try:
        naive = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%S')
    except (TypeError, ValueError):
        return None
    return MOSCOW_TIMEZONE.localize(naive)


def _parse_status(raw_status):
    """Convert the row's status to int; None when it is missing or not numeric."""
    try:
        return int(raw_status)
    except (TypeError, ValueError):
        return None


def import_incomings(dc, client_id, log_type):
    """Read new rows of ``log_type`` from LogBroker and store UTM visits.

    For every row that has query args with at least one UTM parameter and a
    parsable timestamp:

    * the raw row is collected for the local topic log;
    * a record is collected for YT;
    * an OhmIncoming is collected for the database when the row has a
      yandexuid cookie and an important utm_source.

    Rows whose OhmUtm cannot be obtained are skipped entirely. A missing or
    non-numeric status is stored as None rather than aborting the import, and
    a malformed timestamp skips the row with a warning.

    :param dc: data center passed to LogBrokerTopicReader.
    :param client_id: LogBroker client id.
    :param log_type: LogBroker log type to read.
    """
    log.info('Start import incomings: %s', log_type)

    lb_reader = LogBrokerTopicReader(
        dc=dc,
        client_id=client_id,
        log_type=log_type,
        max_row_read_limit=750000,
    )

    ohm_incomings = []
    rows_with_utms = []
    ohm_to_yt = []

    for row in lb_reader.new_rows():
        raw_http_args = row.get('args')
        if not raw_http_args:
            continue

        # Rows without UTM parameters contribute nothing; skip them before
        # doing any further parsing.
        utms = extract_utms(raw_http_args)
        if not utms:
            continue

        raw_timestamp = row.get('timestamp')
        if not raw_timestamp:
            continue

        event_time = _parse_event_time(raw_timestamp)
        if event_time is None:
            log.warning('Skip row with malformed timestamp: %r', raw_timestamp)
            continue

        yandexuid = extract_yandexuid(row.get('cookies'))

        if utms.get('utm_source'):
            utm = get_utm(
                utms.get('utm_source', ''),
                utms.get('utm_campaign', ''),
                utms.get('utm_medium', ''),
                utms.get('utm_term', ''),
            )
            if not utm:
                log.warning('Can\'t get utm')
                continue

            # Only important sources go to the database; YT receives all.
            if yandexuid and is_important_utms(utms):
                ohm_incomings.append(OhmIncoming(
                    eventdate=event_time.date(),
                    eventdatetime=event_time,
                    utm=utm,
                    yandexuid=yandexuid,
                    version=DATA_VERSION,
                ))

        rows_with_utms.append(row)
        ohm_to_yt.append({
            'utm_source': utms.get('utm_source'),
            'utm_campaign': utms.get('utm_campaign'),
            'utm_term': utms.get('utm_term'),
            'utm_medium': utms.get('utm_medium'),
            'utm_content': utms.get('utm_content'),
            'yandexuid': yandexuid,
            'date': event_time.strftime('%Y-%m-%d'),
            'datetime': event_time.strftime('%Y-%m-%d %H:%M:%S'),
            'status': _parse_status(row.get('status')),
            'version': DATA_VERSION,
        })

    if rows_with_utms:
        log_rows(log_type, rows_with_utms)

    if ohm_incomings:
        OhmIncoming.objects.bulk_create(ohm_incomings, batch_size=1000)

    # Fall back to the module-level default when settings lack the flag.
    if ohm_to_yt and getattr(settings, 'ENABLE_YT_DELIVERY', ENABLE_YT_DELIVERY):
        log.info('Write to YT')
        write_ohm_incomings_to_yt(ohm_to_yt, YT_UTM_LOG_PATH)

    log.info('%r incomings saved to db', len(ohm_incomings))


def create_utm_table(table_path):
    """Create a YT table at ``table_path`` with the UTM log schema.

    Missing parent nodes are created as well (``recursive=True``).
    """
    table_attributes = {
        'schema': YT_UTM_LOG_SCHEMA,
        'optimize_for': 'scan',
    }
    yt.create_table(table_path, recursive=True, attributes=table_attributes)


def write_ohm_incomings_to_yt(ohm_incomings, log_path):
    """Append records to per-date YT tables under ``log_path``.

    :param ohm_incomings: iterable of dicts, each with a ``date`` key that
        names the destination table.
    :param log_path: YT directory holding one table per date.
    """
    def event_date(record):
        return record.get('date')

    # groupby() only merges adjacent items, so sort by date first; the sort
    # is stable, which keeps the original order of records within a date and
    # gives exactly one append per table.
    ordered = sorted(ohm_incomings, key=event_date)

    for eventdate, group in groupby(ordered, event_date):
        table_path = os.path.join(log_path, eventdate)
        table = yt.TablePath(table_path, append=True)
        if not yt.exists(table):
            create_utm_table(table_path)
        yt.write_table(table, group)


@atomic
def do_import():
    """Run one import of ``avia-access-log`` inside a database transaction.

    Does nothing (beyond logging) when the host's data center is not in
    ALLOWED_DC or when settings.LB_CLIENT_ID is empty. After the import the
    Setting row keyed by SETTING_KEY is updated with MSK_NOW_STR.
    """
    data_center = HOST_INFO['dc']

    if data_center not in ALLOWED_DC:
        log.info('DC: %s not allowed', data_center)
        return

    if not settings.LB_CLIENT_ID:
        log.info('Empty client_id')
        return

    log.info('Use data center: %s', data_center)
    log.info('Use client_id: %s', settings.LB_CLIENT_ID)

    last_import, _ = Setting.objects.get_or_create(
        key=SETTING_KEY,
        defaults=SETTING_DEFAULTS,
    )

    import_incomings(data_center, settings.LB_CLIENT_ID, 'avia-access-log')

    last_import.value = MSK_NOW_STR
    last_import.save()


def main():
    """Script entry point: set up logging and YT, then run the import.

    ``-v`` / ``--verbose`` additionally sends log output to stdout. Any
    exception raised by the import is logged with its traceback and not
    re-raised.
    """
    create_current_file_run_log()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-v', '--verbose', action='store_true', dest='verbose')
    options = arg_parser.parse_args()

    if options.verbose:
        add_stdout_handler(log)

    configure_wrapper(yt)

    try:
        do_import()
    except Exception:
        log.exception("Unexpected error in import_topics")

# Run the import only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
