#!/usr/bin/env python
# -*- coding: utf-8 -*-
import collections
import os

import luigi
from yt.wrapper import with_context

from crypta.profile.lib import date_helpers

from crypta.profile.utils.config import config
from crypta.profile.utils.luigi_utils import YtDailyRewritableTarget, BaseYtTask
from crypta.profile.tasks.features.process_user_events import ProcessUserEvents

# Domain suffixes under which Google operates a search site,
# taken from https://en.wikipedia.org/wiki/List_of_Google_domains
google_tlds = (
    'ac', 'ad', 'ae', 'al', 'am', 'as', 'at', 'az', 'ba', 'be', 'bf', 'bg', 'bi', 'bj', 'bs', 'bt', 'by', 'ca',
    'cat', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'cl', 'cm', 'cn', 'co.ao', 'co.bw', 'co.ck', 'co.cr', 'co.id',
    'co.il', 'co.in', 'co.jp', 'co.ke', 'co.kr', 'co.ls', 'com', 'co.ma', 'com.af', 'com.ag', 'com.ai', 'com.ar',
    'com.au', 'com.bd', 'com.bh', 'com.bn', 'com.bo', 'com.br', 'com.bz', 'com.co', 'com.cu', 'com.cy', 'com.do',
    'com.ec', 'com.eg', 'com.et', 'com.fj', 'com.gh', 'com.gi', 'com.gt', 'com.hk', 'com.jm', 'com.kh', 'com.kw',
    'com.lb', 'com.lc', 'com.ly', 'com.mm', 'com.mt', 'com.mx', 'com.my', 'com.na', 'com.nf', 'com.ng', 'com.ni',
    'com.np', 'com.om', 'com.pa', 'com.pe', 'com.pg', 'com.ph', 'com.pk', 'com.pr', 'com.py', 'com.qa', 'com.sa',
    'com.sb', 'com.sg', 'com.sl', 'com.sv', 'com.tj', 'com.tr', 'com.tw', 'com.ua', 'com.uy', 'com.vc', 'com.vn',
    'co.mz', 'co.nz', 'co.th', 'co.tz', 'co.ug', 'co.uk', 'co.uz', 'co.ve', 'co.vi', 'co.za', 'co.zm', 'co.zw',
    'cv', 'cz', 'de', 'dj', 'dk', 'dm', 'dz', 'ee', 'es', 'fi', 'fm', 'fr', 'ga', 'ge', 'gf', 'gg', 'gl', 'gm',
    'gp', 'gr', 'gy', 'hn', 'hr', 'ht', 'hu', 'ie', 'im', 'io', 'iq', 'is', 'it', 'je', 'jo', 'kg', 'ki', 'kz',
    'la', 'li', 'lk', 'lt', 'lu', 'lv', 'md', 'me', 'mg', 'mk', 'ml', 'mn', 'ms', 'mu', 'mv', 'mw', 'ne', 'net',
    'nl', 'no', 'nr', 'nu', 'org', 'pl', 'pn', 'ps', 'pt', 'ro', 'rs', 'ru', 'rw', 'sc', 'se', 'sh', 'si', 'sk',
    'sm', 'sn', 'so', 'sr', 'st', 'td', 'tg', 'tk', 'tl', 'tm', 'tn', 'to', 'tt', 'vg', 'vu', 'ws',
)

# Referrer hosts counted as Yandex search by calculate_loyalty().
yandex_search_domains = {'yandex.ru', 'yandex.ua', 'yandex.kz', 'yandex.by', 'yandex.com.tr'}

# Referrer hosts counted as competing search engines by calculate_loyalty():
# a few fixed hosts plus google.<tld> for every suffix listed in google_tlds.
yandex_search_competitor_domains = {'go.mail.ru', 'rambler.ru', 'nova.rambler.ru', 'bing.com'}.union(
    'google.{tld}'.format(tld=suffix) for suffix in google_tlds
)


@with_context
class MergeReferrersReducer(object):
    """Reducer that sums the per-host referrer counters of one yandexuid.

    The class is wrapped with ``with_context`` where it is defined, so it is
    called with a third ``context`` argument whose ``table_index`` is read
    for every row.

    A merged row is emitted only when all of the following hold:

    * no row from another table is seen before a row from table index 0
      (the original comment describes the opposite case as "user was not
      active yesterday");
    * the number of distinct hosts never exceeds ``max_hosts``;
    * the number of distinct hosts is at least ``min_hosts``.

    NOTE(review): the activity check only works as intended if rows of a key
    arrive ordered by input table index -- confirm against the YT reduce
    guarantees.
    """

    def __init__(self, min_hosts, max_hosts):
        """Store the inclusive bounds on the number of distinct hosts.

        :param min_hosts: users with fewer distinct hosts are dropped.
        :param max_hosts: users with more distinct hosts are dropped.
        """
        self.min_hosts = min_hosts
        self.max_hosts = max_hosts

    def __call__(self, key, rows, context):
        """Merge all ``rows`` of one ``key`` into at most one output row.

        :param key: mapping holding the reduce key; ``key['yandexuid']`` is read.
        :param rows: iterable of rows, each with a ``'referrers'`` mapping of
            host -> visit count.
        :param context: object exposing ``table_index`` of the current row.
        :return: generator yielding zero or one dict with ``'yandexuid'`` and
            the summed ``'referrers'`` counter.
        """
        referrers = collections.Counter()
        seen_first_table = False

        for row in rows:
            # Decide whether to drop the user before spending time on
            # merging the hosts of this row.
            if context.table_index == 0:
                seen_first_table = True
            elif not seen_first_table:  # user was not active yesterday
                return

            # .items() exists on both Python 2 and Python 3 mappings,
            # unlike the Python 2 only .iteritems().
            for host, visit_count in row['referrers'].items():
                referrers[host] += visit_count
                # too many hosts
                if len(referrers) > self.max_hosts:
                    return

        if len(referrers) < self.min_hosts:
            # too few hosts
            return

        yield {
            'yandexuid': key['yandexuid'],
            'referrers': referrers,
        }


class MergeReferrersByYandexuid(BaseYtTask):
    """Luigi task that merges daily referrer tables into one table per yandexuid.

    The daily tables are looked up next to the ``'referrers'`` input table,
    one per date returned by ``date_helpers.generate_back_dates`` for
    yesterday and ``config.STANDARD_AGGREGATION_PERIOD``. They are reduced by
    ``yandexuid`` with ``MergeReferrersReducer`` and the result is sorted.
    """
    date = luigi.Parameter()
    priority = 90
    task_group = 'export_profiles'

    def requires(self):
        """Depend on ProcessUserEvents for the same date and the 'metrics' source."""
        return ProcessUserEvents(data_source='metrics', date=self.date)

    def output(self):
        """Return the target for the merged referrers table of ``self.date``."""
        return YtDailyRewritableTarget(
            config.METRICS_MERGED_REFERRERS_BY_YANDEXUID_TABLE,
            self.date,
        )

    def run(self):
        """Collect the existing daily tables, reduce and sort them, then set attributes."""
        yesterday = date_helpers.get_yesterday(self.date)
        back_dates = date_helpers.generate_back_dates(yesterday, config.STANDARD_AGGREGATION_PERIOD)
        daily_tables_dir = os.path.dirname(self.input()['referrers'].table)

        # Newest date first, so that the reducer sees the newest existing
        # table as table index 0.
        # NOTE(review): missing daily tables are skipped, so index 0 is the
        # newest table that exists, which is not necessarily yesterday's.
        candidate_tables = [
            os.path.join(daily_tables_dir, str(day))
            for day in reversed(sorted(back_dates))
        ]
        source_tables = [table for table in candidate_tables if self.yt.exists(table)]

        with self.yt.Transaction():
            merged_table = self.output().table

            self.yt.create_empty_table(
                merged_table,
                schema={
                    'yandexuid': 'uint64',
                    'referrers': 'any',
                },
            )

            reducer = MergeReferrersReducer(
                min_hosts=config.MINIMUM_SITES,
                max_hosts=config.MAXIMUM_SITES,
            )
            self.yt.run_reduce(
                reducer,
                source_tables,
                merged_table,
                reduce_by='yandexuid',
                spec={
                    # 2 GiB per reducer job
                    'reducer': {'memory_limit': 2 * 1024 * 1024 * 1024},
                },
            )

            self.yt.run_sort(
                merged_table,
                sort_by='yandexuid',
                spec={'title': 'Sort merged hits'}
            )

            # Record how the table was produced.
            table_attributes = (
                ('generate_date', self.date),
                ('source_tables', source_tables),
                ('min_hosts', int(config.MINIMUM_SITES)),
                ('max_hosts', int(config.MAXIMUM_SITES)),
            )
            for attribute_name, attribute_value in table_attributes:
                self.yt.set_attribute(merged_table, attribute_name, attribute_value)


def calculate_loyalty(row):
    """Mapper that computes the share of Yandex search among search referrers.

    Visit counts from ``row['referrers']`` are summed separately for hosts in
    ``yandex_search_domains`` and hosts in ``yandex_search_competitor_domains``;
    all other hosts are ignored.

    :param row: mapping with ``'yandexuid'`` and ``'referrers'`` (host -> visit count).
    :return: generator yielding one dict with ``'yandexuid'`` and
        ``'yandex_loyalty'`` (Yandex visits divided by Yandex plus competitor
        visits) when the two sums together exceed 3, otherwise nothing.
    """
    yandex_visits = 0
    competitor_visits = 0

    # .items() exists on both Python 2 and Python 3 mappings,
    # unlike the Python 2 only .iteritems().
    for host, visit_count in row['referrers'].items():
        if host in yandex_search_domains:
            yandex_visits += visit_count
        elif host in yandex_search_competitor_domains:
            competitor_visits += visit_count

    search_visits = yandex_visits + competitor_visits
    # Users with 3 or fewer search visits are skipped.
    if search_visits > 3:
        yield {
            'yandexuid': row['yandexuid'],
            'yandex_loyalty': float(yandex_visits) / float(search_visits),
        }


class CalculateLoyalty(BaseYtTask):
    """Luigi task that maps the merged referrers table into a loyalty table.

    Each input row is passed through ``calculate_loyalty``; the output table
    has ``yandexuid`` and ``yandex_loyalty`` columns and is sorted by
    ``yandexuid``.
    """
    date = luigi.Parameter()
    priority = 90
    task_group = 'export_profiles'

    def requires(self):
        """Depend on the merged referrers table for the same date."""
        return MergeReferrersByYandexuid(self.date)

    def output(self):
        """Return the target for the loyalty table of ``self.date``."""
        return YtDailyRewritableTarget(config.YANDEX_LOYALTY_TABLE, self.date)

    def run(self):
        """Create the output table, run the mapper, sort and stamp the result."""
        with self.yt.Transaction():
            loyalty_table = self.output().table
            loyalty_schema = {'yandexuid': 'uint64', 'yandex_loyalty': 'double'}
            self.yt.create_empty_table(loyalty_table, schema=loyalty_schema)

            # Log the competitor hosts used by this run.
            self.logger.info('yandex_competitor_domains = {}'.format(yandex_search_competitor_domains))

            merged_referrers_table = self.input().table
            self.yt.run_map(calculate_loyalty, merged_referrers_table, loyalty_table)
            self.yt.run_sort(loyalty_table, sort_by='yandexuid')
            self.yt.set_attribute(loyalty_table, 'generate_date', self.date)
