# -*- coding: utf-8 -*-

import re
import urlparse
import operator
import functools

import yt.wrapper as yt
from blockstat import parse_blocks, BlocksValidationError
from uniq_urls import uniq_urls
from baobab_matcher import BaobabMatcher, SkipEventException

# Request paths that belong to the SERP: /search/, /blogs/search/ and
# /people/search/, except paths that continue with "smart".
REQ_PATH_REG_EXP = re.compile(r"^(/blogs|/people)?/search/(?!smart)")

# Hosts of production SERP installations. The dot after "www" is escaped so
# that only a literal "www." prefix is accepted (previously any character
# matched there, e.g. "wwwXyandex.ru").
PROD_YANDEX_HOST_REG_EXP = re.compile(r"^(www\.)?yandex\.[\w.]+")

# Captures everything that follows "yandex." in a host name (the TLD part).
YANDEX_TLD_REG_EXP = re.compile(r"yandex\.(.+)")

# Query parameters that mark a request as internal (debugging, dumps,
# downloads, etc.). The value describes which parameter values are rejected:
#   None          - reject the request whatever the value is;
#   regex         - reject when a value matches the regex;
#   list of regex - reject when a value matches any regex of the list.
INTERNAL_PARAMS = {
    'ajax': None,
    'callback': None,
    'dump': None,
    'export': None,
    'serp3_granny': None,
    'exp_flags': [re.compile(r'^template=.*$'), re.compile(r'^test_tool=.*$')],
    'foreverdata': None,
    'json_dump': None,
    'no_bolver': None,
    'no-tests': None,
    'rearr': None,
    'srcrwr': None,
    'srcskip': None,
    'test-id': re.compile(r'^1$'),
    'text': re.compile(r'^foreverdata$'),
    'tpid': None,
    'waitall': None,
}


def is_valid_record(row):
    """Tell whether a log row is a regular SERP request worth processing.

    A row is rejected when:
      * it has no ``request`` or no ``blocks`` value;
      * its request path does not match ``REQ_PATH_REG_EXP``;
      * its cookies contain the ``i-m-not-a-hacker`` marker;
      * its user agent contains ``YandexDirect``;
      * its query string carries one of ``INTERNAL_PARAMS`` with a
        rejected value.

    :param row: mapping with the log record fields (``request``, ``blocks``,
        ``cookies``, ``user_agent``).
    :return: ``True`` if the row should be processed, ``False`` otherwise.
    """
    req_path = row.get('request')

    # rows without a URL or without counters are of no interest
    if not (req_path and row.get('blocks')):
        return False

    if not REQ_PATH_REG_EXP.match(req_path):
        return False

    # A column may be present but hold None, so ``get(key, '')`` is not
    # enough: fall back to an empty string explicitly.

    # drop requests with the cookie that allows using internal parameters
    if 'i-m-not-a-hacker' in (row.get('cookies') or ''):
        return False

    # drop the Direct bot that probes for XSS
    if 'YandexDirect' in (row.get('user_agent') or ''):
        return False

    # drop requests with parameters used for debugging, downloading, etc.
    query = urlparse.urlparse(req_path).query
    # keep_blank_values=True: a parameter given without a value ("?ajax=" or
    # "&dump") must still be seen, otherwise the "any value" rule is bypassed
    parsed_query = urlparse.parse_qs(query, keep_blank_values=True)

    for param_name, value_masks in INTERNAL_PARAMS.items():
        if param_name not in parsed_query:
            continue

        # no regex for the value means: reject whatever the value is
        if not value_masks:
            return False

        # normalize a single regex to a list for uniform handling
        if not isinstance(value_masks, list):
            value_masks = [value_masks]

        # reject if any value of the parameter matches any of the regexes
        for param_value in parsed_query[param_name]:
            if any(mask.match(param_value) for mask in value_masks):
                return False

    return True


def is_valid_host(host):
    """Tell whether ``host`` matches ``PROD_YANDEX_HOST_REG_EXP``.

    A missing host (``None`` or an empty string) is reported as invalid
    instead of raising ``TypeError`` from the regex match.

    :param host: host name string, or ``None``.
    :return: ``True`` if the host matches, ``False`` otherwise.
    """
    if not host:
        return False

    return PROD_YANDEX_HOST_REG_EXP.match(host) is not None


def detect_platform(url):
    """Classify a request URL as 'pad', 'searchapp', 'touch' or 'desktop'.

    Pad markers take precedence over touch markers. A touch URL that also
    carries ``app_version=`` is reported as 'searchapp'. Anything without
    pad or touch markers is 'desktop'.
    """
    if '/search/pad' in url or '/padsearch' in url:
        return 'pad'

    is_touch = '/search/touch' in url or '/touchsearch' in url
    if not is_touch:
        return 'desktop'

    return 'searchapp' if 'app_version=' in url else 'touch'


def detect_host_tld(host):
    """Return the part of ``host`` captured by ``YANDEX_TLD_REG_EXP``.

    Raises ``IndexError`` when the pattern does not occur in ``host``.
    """
    tld_matches = YANDEX_TLD_REG_EXP.findall(host)
    return tld_matches[0]


@yt.aggregator
@yt.with_context
class Mapper(object):
    """Map log rows to example SERP URLs grouped by counter.

    For every valid row the mapper collects its counters (baobab keys plus
    legacy block paths) and remembers the request URL under each counter,
    split by host TLD and platform. One output row is emitted per counter::

        {'counter': ..., 'desktop': [...], 'pad': [...],
         'touch': [...], 'searchapp': [...]}

    :param max_urls_per_tld: cap on URLs kept per (counter, TLD, platform);
        also passed to ``BaobabMatcher``.
    :param baobab_counters_list: list passed to ``BaobabMatcher``; defaults
        to an empty list.
    """

    def __init__(self, max_urls_per_tld, baobab_counters_list=None):
        # None instead of a mutable [] default, so that instances never
        # share one list object
        if baobab_counters_list is None:
            baobab_counters_list = []

        self.max_urls_per_tld = max_urls_per_tld
        self.baobab_matcher = BaobabMatcher(baobab_counters_list, max_urls_per_tld)

    def __call__(self, rows, context):
        """Consume all ``rows`` and yield one aggregated row per counter.

        ``context`` is accepted because of ``yt.with_context`` and is unused.
        """
        # counter -> tld -> platform -> set of URLs
        data = {}

        for row in rows:
            if not is_valid_record(row):
                continue

            req_path = row.get('request')
            host = row.get('vhost')

            # a row without vhost cannot be attributed to a TLD; skip it
            # instead of failing inside the host regex
            if not host or not is_valid_host(host):
                continue

            tld = detect_host_tld(host)
            platform = detect_platform(req_path)

            factor = tld + ':' + platform

            for counter in self.get_counters(row, factor):
                tlds = data.setdefault(counter, {})

                if tld not in tlds:
                    tlds[tld] = {
                        'desktop': set(),
                        'pad': set(),
                        'touch': set(),
                        'searchapp': set(),
                    }

                urls = tlds[tld][platform]

                if len(urls) < self.max_urls_per_tld:
                    urls.add('https://' + host + req_path)

        for counter, tlds in data.iteritems():
            result = {'counter': counter}

            # merge the per-TLD URL sets into one list per platform
            for platforms in tlds.itervalues():
                for name, urls in platforms.iteritems():
                    if name in result:
                        result[name].extend(urls)
                    else:
                        # sets are not serializable
                        result[name] = list(urls)

            yield result

    def get_counters(self, row, factor):
        """Return the union of baobab and legacy counters of ``row``.

        Returns an empty set when ``SkipEventException`` is raised while
        collecting them.
        """
        try:
            baobab_counters = self.get_baobab_counters(row, factor)
            legacy_counters = self.get_legacy_counters(row)
            return baobab_counters | legacy_counters
        except SkipEventException:
            return set()

    def get_legacy_counters(self, row):
        """Return the set of unique block paths parsed from ``row['blocks']``."""
        try:
            blocks = parse_blocks(row.get('blocks'))
        except BlocksValidationError:
            # Broken records occasionally show up in the logs; ignoring
            # them is expected.
            return set()

        # keep unique paths only: ten /web/item/title entries that differ
        # just by pos are of no interest
        return set(block.path for block in blocks)

    def get_baobab_counters(self, row, factor):
        """Return the keys ``BaobabMatcher`` yields for ``row['json_blocks']``."""
        return self.baobab_matcher.get_keys(row.get('json_blocks'), factor)


@yt.with_context
class Reducer(object):
    """Merge the per-platform URL lists of one counter, capped at ``max_urls``."""

    def __init__(self, max_urls):
        self.max_urls = max_urls

    def __call__(self, key, rows, context):
        """Yield a single row holding the merged URLs for ``key['counter']``.

        Every column of the incoming rows except 'counter' is treated as a
        platform name mapped to a collection of URLs.
        """
        merged = {'counter': key['counter']}

        for row in rows:
            for column, urls in row.iteritems():
                if column != 'counter':
                    # union of what is already kept and the de-duplicated
                    # URLs of this row
                    seen = set(merged.get(column, []))
                    seen |= uniq_urls(urls)

                    # truncate after every row to keep the list bounded
                    merged[column] = list(seen)[:self.max_urls]

        yield merged


@yt.with_context
class ReducerWithTldsCoverage(object):
    """Merge per-platform URLs of one counter while limiting URLs per TLD.

    At most ``max_urls_per_tld`` URLs are kept for each host TLD, and at
    most ``max_urls`` URLs are kept per platform overall.
    """

    def __init__(self, max_urls, max_urls_per_tld):
        self.max_urls = max_urls
        self.max_urls_per_tld = max_urls_per_tld

    def __call__(self, key, rows, context):
        """Yield a single row holding the merged URLs for ``key['counter']``.

        Every column of the incoming rows except 'counter' is treated as a
        platform name mapped to a collection of URLs.
        """
        merged = {'counter': key['counter']}

        for row in rows:
            for column, urls in row.iteritems():
                if column == 'counter':
                    continue

                # already kept URLs first, then the ones from this row
                candidates = uniq_urls(merged.get(column, []) + list(urls))

                # bucket the candidates by TLD, capping every bucket
                per_tld = {}
                for url in candidates:
                    tld = detect_host_tld(urlparse.urlparse(url).hostname)
                    bucket = per_tld.setdefault(tld, [])

                    if len(bucket) < self.max_urls_per_tld:
                        bucket.append(url)

                # flatten the buckets back into a single list
                flattened = []
                for bucket in per_tld.values():
                    flattened.extend(bucket)

                merged[column] = flattened[:self.max_urls]

        yield merged


def run_map_reduce(max_urls, max_urls_per_tld, need_tlds_coverage, input_tables, output_table, baobab_counters_list=None):
    """Start the map-reduce operation that collects example URLs per counter.

    :param max_urls: cap on URLs per platform in the reducer output.
    :param max_urls_per_tld: cap on URLs per TLD, used by ``Mapper`` and by
        ``ReducerWithTldsCoverage``.
    :param need_tlds_coverage: if true, reduce with
        ``ReducerWithTldsCoverage``; otherwise with ``Reducer``.
    :param input_tables: input tables, passed to ``yt.run_map_reduce``.
    :param output_table: output table, passed to ``yt.run_map_reduce``.
    :param baobab_counters_list: list passed to ``Mapper``; defaults to an
        empty list.
    :return: whatever ``yt.run_map_reduce`` returns when called with
        ``sync=False``.
    """
    # None instead of a mutable [] default, so that calls never share one
    # list object
    if baobab_counters_list is None:
        baobab_counters_list = []

    if need_tlds_coverage:
        reducer = ReducerWithTldsCoverage(max_urls, max_urls_per_tld)
    else:
        reducer = Reducer(max_urls)

    mapper = Mapper(max_urls_per_tld, baobab_counters_list)

    return yt.run_map_reduce(mapper, reducer,
                             input_tables,
                             output_table,
                             reduce_by='counter',
                             spec={'data_size_per_map_job': 128 * 1024 * 1024},
                             sync=False)
