#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import re
from collections import Counter

import luigi
from yt.wrapper import create_table_switch, OperationsTracker

from crypta.profile.lib import date_helpers

from crypta.profile.utils.config import config
from crypta.profile.utils.loggers import send_to_graphite
from crypta.profile.utils.luigi_utils import (
    YtTarget,
    AttributeExternalInput,
    BaseYtTask,
    OldNodesByNameCleaner,
)


# Bounds (inclusive) on the number of hosts in a session for it to be emitted.
MINIMUM_SESSION_LENGTH = 5
MAXIMUM_SESSION_LENGTH = 1000
# Largest difference between the `timestamp` values of consecutive counted
# events that still keeps them in the same session.
# NOTE(review): presumably seconds -- confirm against the parsed log schema.
SESSION_TIMEOUT = 600
# Minimum total number of counted hits a user needs for a hits row to be emitted.
MINIMUM_USER_EVENTS_HITS = 1

# Matches a single hostname label: 1-63 characters made of letters, digits and
# hyphens, neither starting nor ending with a hyphen. Intended to be used with
# `.match()`, which anchors the start of the label.
# The end is anchored with `\Z` rather than `$`: `$` also matches right before
# a trailing newline, which would let a label such as "com\n" pass validation.
ALLOWED_HOST_REGEX = re.compile(r'(?!-)[A-Z\d-]{1,63}(?<!-)\Z', re.IGNORECASE)


def is_valid_hostname(host):
    """Tell whether ``host`` looks like a syntactically valid hostname.

    A host is accepted when it is not ``None``, its length is between 1 and
    255 inclusive, and every dot-separated label matches
    ``ALLOWED_HOST_REGEX``.

    :param host: candidate hostname, or ``None``.
    :return: ``True`` if the host passes all checks, ``False`` otherwise.
    """
    if host is None:
        return False
    if not 0 < len(host) <= 255:
        return False

    for label in host.split('.'):
        # An empty label (e.g. from a leading, trailing or doubled dot)
        # cannot match because the pattern requires at least one character.
        if not ALLOWED_HOST_REGEX.match(label):
            return False
    return True


def preprocess_host(host):
    """Return ``host`` unchanged if it is a valid hostname, otherwise ``None``.

    :param host: raw host value; may be ``None`` or empty.
    :return: the same ``host`` object when ``is_valid_hostname`` accepts it,
        ``None`` for falsy or invalid values.
    """
    if not host:
        return None
    return host if is_valid_hostname(host) else None


class UserEventsProcessor(object):
    """Reducer that derives hits, sessions and referrers from one user's events.

    An instance is called with a reduce key and the rows of that key. Output
    rows are routed with ``create_table_switch`` to three tables:

    * index 0 -- ``site_weights``: per-host counter of counted visits;
    * index 1 -- ``session``: hosts of one session joined with spaces;
    * index 2 -- ``referrers``: per-host counter of referrer occurrences.

    :param min_session: minimum number of hosts in an emitted session.
    :param max_session: maximum number of hosts in an emitted session.
    :param session_timeout: largest timestamp gap that keeps two consecutive
        counted events in the same session.
    :param min_hits: minimum total of counted visits required to emit hits.
    :param max_sites: maximum number of distinct hosts allowed in the hits
        and referrers counters.
    """

    def __init__(self, min_session, max_session, session_timeout, min_hits, max_sites):
        self.min_session = min_session
        self.max_session = max_session
        self.session_timeout = session_timeout
        self.min_hits = min_hits
        self.max_sites = max_sites

    def is_session_valid(self, session):
        """Return ``True`` when the session length is within the configured bounds (inclusive)."""
        return self.min_session <= len(session) <= self.max_session

    def __call__(self, key, rows):
        """Process the rows of one ``yandexuid`` and yield output rows.

        :param key: reduce key; ``key['yandexuid']`` is copied to every output row.
        :param rows: iterable of rows with ``host``, ``timestamp``,
            ``referer_host`` and optionally ``event_type``.
            NOTE(review): relies on rows arriving ordered by timestamp -- the
            map-reduce call in this file sorts by ``['yandexuid', 'timestamp']``.
        :return: generator of table switches and output rows.
        """
        hits_by_host = Counter()
        referrer_counts = Counter()
        current_session = []
        last_ts = 0
        last_host = None

        for row in rows:
            # Events of type 'ln' are ignored entirely.
            if row.get('event_type') == 'ln':
                continue

            host = preprocess_host(row['host'])
            ts = row['timestamp']

            if host:
                # Consecutive events on the same host are collapsed; note that
                # the referrer of such a row is skipped as well.
                if host == last_host:
                    continue

                within_timeout = ts - last_ts <= self.session_timeout
                if not within_timeout:
                    # The gap is too long: flush the finished session and
                    # start a new one from the current host.
                    if self.is_session_valid(current_session):
                        yield create_table_switch(1)
                        yield {
                            'yandexuid': key['yandexuid'],
                            'session': ' '.join(current_session),
                        }
                    current_session = [host]
                else:
                    current_session.append(host)
                    # An oversized session stops processing of this user; it
                    # fails the validity check after the loop and is dropped.
                    if len(current_session) > self.max_session:
                        break

                last_ts = ts
                last_host = host

                hits_by_host[host] += 1
                # Too many distinct hosts: stop; the hits row is then
                # suppressed by the size check after the loop.
                if len(hits_by_host) > self.max_sites:
                    break

            raw_referrer = row['referer_host']
            if raw_referrer:
                referrer_host = preprocess_host(raw_referrer)
                # Self-referrals (referrer equal to the row's own host) are not counted.
                if referrer_host and referrer_host != host:
                    referrer_counts[referrer_host] += 1
                    if len(referrer_counts) > self.max_sites:
                        break

        # Emit the session that was still open when the rows ended.
        if self.is_session_valid(current_session):
            yield create_table_switch(1)
            yield {
                'yandexuid': key['yandexuid'],
                'session': ' '.join(current_session),
            }

        if referrer_counts and len(referrer_counts) <= self.max_sites:
            yield create_table_switch(2)
            yield {
                'yandexuid': key['yandexuid'],
                'referrers': referrer_counts,
            }

        enough_hits = sum(hits_by_host.values()) >= self.min_hits
        if enough_hits and len(hits_by_host) <= self.max_sites:
            yield create_table_switch(0)
            yield {
                'yandexuid': key['yandexuid'],
                'site_weights': hits_by_host,
            }


class ProcessUserEvents(BaseYtTask):
    """Luigi task that builds the hits, sessions and referrers tables of one data source.

    The parsed log table named after the day before ``date`` is reduced by
    ``yandexuid`` with ``UserEventsProcessor``; the three resulting tables are
    sorted by ``yandexuid``, annotated with the parameters used, and their row
    counts are reported to graphite.
    """

    date = luigi.Parameter()
    data_source = luigi.Parameter()

    priority = 100
    task_group = 'export_profiles'

    def requires(self):
        """Return the parsed log input and one cleaner per output folder."""
        parsed_log_table = os.path.join(
            config.PARSED_LOGS_YT_DIRECTORY,
            self.data_source,
            date_helpers.get_yesterday(self.date),
        )
        hits_folder = os.path.join(config.PROFILES_INPUT_YT_DIRECTORY, self.data_source, 'hits')
        sessions_folder = os.path.join(config.PROFILES_INPUT_YT_DIRECTORY, self.data_source, 'sessions')
        referrers_folder = os.path.join(config.PROFILES_INPUT_YT_DIRECTORY, self.data_source, 'referrers')

        return {
            # NOTE(review): presumably satisfied once the table's `closed`
            # attribute is True -- confirm in AttributeExternalInput.
            'parsed_log': AttributeExternalInput(
                table=parsed_log_table,
                attribute_name='closed',
                attribute_value=True,
            ),
            'hits_cleaner': OldNodesByNameCleaner(
                self.date,
                folder=hits_folder,
                lifetime=config.NUMBER_OF_HITS_TABLES_TO_KEEP,
            ),
            'sessions_cleaner': OldNodesByNameCleaner(
                self.date,
                folder=sessions_folder,
                lifetime=config.STANDARD_AGGREGATION_PERIOD,
            ),
            'referrers_cleaner': OldNodesByNameCleaner(
                self.date,
                folder=referrers_folder,
                lifetime=config.STANDARD_AGGREGATION_PERIOD,
            ),
        }

    def output(self):
        """Return the ``hits``, ``sessions`` and ``referrers`` targets for yesterday's date."""
        yesterday = date_helpers.get_yesterday(self.date)
        table_template = os.path.join(
            config.PROFILES_INPUT_YT_DIRECTORY,
            self.data_source,
            '{}',
            yesterday,
        )
        return {
            name: YtTarget(table_template.format(name))
            for name in ('hits', 'sessions', 'referrers')
        }

    def run(self):
        """Create the output tables, run the reduce, sort the results and report metrics.

        Everything happens inside a single YT transaction.
        """
        # Columns read from the parsed log; only 'metrics' provides `event_type`.
        columns_by_data_source = {
            'metrics': ('yandexuid', 'timestamp', 'host', 'referer_host', 'event_type'),
            'bar': ('yandexuid', 'timestamp', 'host', 'referer_host'),
        }

        with self.yt.Transaction():
            targets = self.output()
            hits_table = targets['hits'].table
            sessions_table = targets['sessions'].table
            referrers_table = targets['referrers'].table
            # The order matches the table indices used by UserEventsProcessor
            # (0 -- hits, 1 -- sessions, 2 -- referrers).
            output_tables = (hits_table, sessions_table, referrers_table)

            schemas = (
                (hits_table, {
                    'yandexuid': 'uint64',
                    'site_weights': 'any',
                }),
                (sessions_table, {
                    'yandexuid': 'uint64',
                    'session': 'string',
                }),
                (referrers_table, {
                    'yandexuid': 'uint64',
                    'referrers': 'any',
                }),
            )
            for table, schema in schemas:
                self.yt.create_empty_table(table, schema=schema)

            reducer = UserEventsProcessor(
                min_session=MINIMUM_SESSION_LENGTH,
                max_session=MAXIMUM_SESSION_LENGTH,
                session_timeout=SESSION_TIMEOUT,
                min_hits=MINIMUM_USER_EVENTS_HITS,
                max_sites=config.MAXIMUM_SITES,
            )
            source = self.yt.TablePath(
                self.input()['parsed_log'].table,
                columns=columns_by_data_source[self.data_source],
            )
            operation_spec = {
                'title': 'Processing {} data for {}'.format(self.data_source, date_helpers.get_yesterday(self.date)),
                'reducer': {
                    'memory_limit': 4 * 1024 * 1024 * 1024,
                },
            }
            # No mapper is passed (None); rows go straight to sort and reduce.
            self.yt.run_map_reduce(
                None,
                reducer,
                source,
                list(output_tables),
                reduce_by='yandexuid',
                sort_by=['yandexuid', 'timestamp'],
                spec=operation_spec,
            )

            # The three sorts are started with sync=False and registered with
            # the tracker used as a context manager around them.
            with OperationsTracker() as tracker:
                for table in output_tables:
                    tracker.add(
                        self.yt.run_sort(
                            table,
                            sort_by='yandexuid',
                            sync=False,
                        )
                    )

            hits_attributes = (
                ('min_hits', MINIMUM_USER_EVENTS_HITS),
                ('max_sites', config.MAXIMUM_SITES),
            )
            for attribute, value in hits_attributes:
                self.yt.set_attribute(hits_table, attribute, value)

            self.yt.run_merge(
                sessions_table,
                sessions_table,
                mode='auto',
                spec={'combine_chunks': True},
            )

            sessions_attributes = (
                ('min_session', MINIMUM_SESSION_LENGTH),
                ('max_session', MAXIMUM_SESSION_LENGTH),
                ('session_timeout', SESSION_TIMEOUT),
            )
            for attribute, value in sessions_attributes:
                self.yt.set_attribute(sessions_table, attribute, value)

            send_to_graphite(
                'input_data.user_events_processed.{}'.format(self.data_source),
                self.yt.row_count(self.input()['parsed_log'].table)
            )
            output_metrics = (
                ('hits.{}', hits_table),
                ('sessions.{}', sessions_table),
                ('referrers.{}', referrers_table),
            )
            for metric_template, table in output_metrics:
                send_to_graphite(
                    metric_template.format(self.data_source),
                    self.yt.row_count(table),
                )
