# -*- coding: utf-8 -*-

import os.path
import logging
import urllib
import random

from sandbox import common
import sandbox.common.types.client as ctc

from sandbox.common import errors
from sandbox.sandboxsdk import paths
from sandbox.projects import resource_types
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.parameters import SandboxSelectParameter
from sandbox.sandboxsdk.parameters import SandboxStringParameter
from sandbox.sandboxsdk.parameters import SandboxIntegerParameter
from sandbox.projects.common.cms import get_cms_resource
from sandbox.sandboxsdk.sandboxapi import ARCH_ANY
from sandbox.projects.common.dolbilka import convert_queries_to_plan
from sandbox.projects.common.search.eventlog import eventlog
from sandbox.projects.common.search.response import cgi as search_cgi
from sandbox.projects.common.search.settings import VLA_REGION
from sandbox.projects.common.search.settings import MAN_REGION
from sandbox.projects.common.search.settings import SAS_REGION
from sandbox.projects.common.search.settings import INDEX_MAIN
from sandbox.projects.common.search.settings import INDEX_ALL
from sandbox.projects.common.search.settings import VideoSettings
from sandbox.projects.common import utils
from sandbox.projects.common import string


# Pseudo-name that stands for "all indexes".
_COLLECTION_ALL = 'all'

# Complete list of the collections handled by this task.
_FULL_COLLECTION_LIST = (_COLLECTION_ALL, VideoSettings.COLLECTION_MAIN)

# Task context key that stores the id of the configuration file resource.
_CONFIG_RESOURCE_ID = 'out_config_resource_id'

# Task context key that stores the id of the rearrange data resource.
_DATA_RESOURCE_ID = 'out_data_resource_id'

# Task context key template (filled with a collection name) that stores
# the id of the queries resource.
_QUERIES_RESOURCE_ID = 'out_%s_queries_resource_id'

# Task context key template (filled with a collection name) that stores
# the id of the plan resource.
_PLAN_RESOURCE_ID = 'out_%s_plan_resource_id'

# Default attributes for the resources being loaded.
_DEFAULT_ATTRIBUTES = 'debug=yes'


class RegionParameter(SandboxSelectParameter):
    """
        Region

        Select parameter whose value is read by the task as the region
        passed to the VideoSettings tag lookups and used when picking
        the nanny service name.
    """

    name = 'region'
    description = 'Region'
    # Only VLA and SAS are offered; each choice uses the region constant
    # for both elements of the pair.
    choices = [(VLA_REGION, VLA_REGION), (SAS_REGION, SAS_REGION)]
    default_value = VLA_REGION


class MiddleTypeParameter(SandboxSelectParameter):
    """
        Middlesearch type

        Select parameter choosing between the mmeta and the int
        middlesearch; defaults to mmeta.
    """

    name = 'middle_type'
    description = 'Middle type'
    # presumably (label, value) pairs -- confirm against SandboxSelectParameter
    choices = [('Mmeta', VideoSettings.MMETA_ID), ('Int', VideoSettings.INT_ID)]
    default_value = VideoSettings.MMETA_ID


class MetaLogName:
    """
        Default eventlog file name templates

        The %META_TYPE% and %PORT% placeholders are substituted with the
        middlesearch type and the instance port before the logs are requested.
    """
    # Template for the eventlog that is currently being written
    CURRENT = 'current-eventlog-%META_TYPE%-%PORT%'
    # Template for the previous eventlog
    PREVIOUS = 'eventlog-%META_TYPE%-%PORT%.PREV'


class LogNameParameter(SandboxStringParameter):
    """
        Name of the intermediate search log file (int or mmeta)

        When left blank the task falls back to the MetaLogName.CURRENT
        and MetaLogName.PREVIOUS templates.
    """

    name = 'log_name'
    description = 'Mmeta/Int log name (blank for default)'


class RequestCountParameter(SandboxIntegerParameter):
    """
        Number of records taken from a single log

        The task keeps loading queries until at least this many
        have been collected for a collection.
    """

    name = 'request_count'
    description = 'Request count'
    default_value = 10000


class AllQueriesAttrsParameter(SandboxStringParameter):
    """
        Attributes to add to the created resource
        that holds all queries

        An empty value makes the task skip creating the resource.
    """

    name = 'all_queries_attributes'
    description = 'Create and set attrs to all queries (e.g. attr1=v1, attr2=v2)'
    default_value = _DEFAULT_ATTRIBUTES


class MainQueriesAttrsParameter(SandboxStringParameter):
    """
        Attributes to add to the created resource
        that holds the queries to the main search

        An empty value makes the task skip creating the resource.
    """

    name = 'main_queries_attributes'
    description = 'Create and set attrs to main queries (e.g. attr1=v1, attr2=v2)'
    default_value = _DEFAULT_ATTRIBUTES


class ConfigAttrsParameter(SandboxStringParameter):
    """
        Attributes to add to the created resource
        of the configuration file

        An empty value makes the task skip creating the resource.
    """

    name = 'config_attributes'
    description = 'Create and set attrs to config (e.g. attr1=v1, attr2=v2)'
    default_value = _DEFAULT_ATTRIBUTES


class DataAttrsParameter(SandboxStringParameter):
    """
        Attributes to add to the created resource
        of the rearrange data

        An empty value makes the task skip creating the resource.
        A non-empty value is rejected by the task for the int middlesearch type.
    """

    name = 'data_attributes'
    description = 'Create and set attrs to data (e.g. attr1=v1, attr2=v2)'
    default_value = _DEFAULT_ATTRIBUTES


class VideoLoadMiddlesearchResources(SandboxTask):
    """
        Task that downloads queries and configuration files
        from the production middlesearch
        and builds Sandbox resources from them
    """

    type = 'VIDEO_LOAD_MIDDLESEARCH_RESOURCES'
    client_tags = ctc.Tag.IPV4

    input_parameters = (
        MiddleTypeParameter,
        RegionParameter,
        LogNameParameter,
        RequestCountParameter,
        AllQueriesAttrsParameter,
        MainQueriesAttrsParameter,
        ConfigAttrsParameter,
        DataAttrsParameter,
    )

    def _create_queries_resources(self, collection_name, attributes):
        """
            Create the queries resource and the plan resource for the given
            collection and remember their ids in the task context

            :param collection_name: collection name, used in the resource file
                names, in the description and in the context keys
            :param attributes: attributes passed to both created resources
        """

        resource_description = "%s, queries to collection '%s'" % (self.descr, collection_name)

        def create(file_name_template, resource_type):
            # Both resources share the description, architecture and attributes
            return self.create_resource(
                resource_description,
                file_name_template % collection_name,
                resource_type,
                arch=ARCH_ANY,
                attributes=attributes
            )

        queries = create('mmeta-%s.queries.txt', resource_types.VIDEO_MIDDLESEARCH_PLAIN_TEXT_REQUESTS)
        plan = create('mmeta-%s.plan.bin', resource_types.VIDEO_MIDDLESEARCH_PLAN)

        # The ids are stored only after both resources were created
        self.ctx[_QUERIES_RESOURCE_ID % collection_name] = queries.id
        self.ctx[_PLAN_RESOURCE_ID % collection_name] = plan.id

    def on_execute(self):
        """
            Create the requested output resources and fill them from
            production middlesearch instances

            Steps:
              1. create config / data / queries+plan resources for every
                 non-empty *attributes parameter and store their ids in the context;
              2. find live instances of the matching nanny service;
              3. load the config and the rearrange data (when requested);
              4. for every requested collection load queries, convert them
                 to a plan and mark both resources as ready.

            Raises errors.TaskFailure when rearrange data is requested for the
            int middlesearch, when queries cannot be acquired from any live
            instance, or when the resulting queries/plan files are smaller
            than 16 KiB.
        """
        middle_type = self.ctx[MiddleTypeParameter.name]
        region = self.ctx[RegionParameter.name]

        config_attributes = self.ctx[ConfigAttrsParameter.name]
        if config_attributes:
            config_resource = self.create_resource(
                self.descr,
                '%s.cfg' % middle_type,
                resource_types.VIDEO_MIDDLESEARCH_CONFIG,
                arch=ARCH_ANY,
                attributes=string.parse_attrs(config_attributes)
            )
            self.ctx[_CONFIG_RESOURCE_ID] = config_resource.id

        data_attributes = self.ctx[DataAttrsParameter.name]
        if data_attributes:
            if middle_type == VideoSettings.INT_ID:
                raise errors.TaskFailure('No rearrange data on int available')
            data_resource = self.create_resource(
                self.descr,
                'mmeta.data',
                resource_types.VIDEO_MIDDLESEARCH_DATA,
                arch=ARCH_ANY,
                attributes=string.parse_attrs(data_attributes)
            )
            self.ctx[_DATA_RESOURCE_ID] = data_resource.id

        nanny_client = VideoSettings.get_nanny(VideoSettings.get_nanny_token(self))

        # NOTE(review): the results of the *_tag() lookups below are never read
        # before being overwritten; the calls are kept in case they validate
        # their arguments or have other side effects -- confirm and drop if not.
        all_attributes = self.ctx[AllQueriesAttrsParameter.name]
        if all_attributes:
            all_attributes_dict = string.parse_attrs(all_attributes)
            cms_configuration, instance_tag_name = VideoSettings.basesearch_tag(
                INDEX_ALL, region, VideoSettings.get_nanny_token(self)
            )
            self._create_queries_resources(
                _COLLECTION_ALL,
                all_attributes_dict
            )

        main_attributes = self.ctx[MainQueriesAttrsParameter.name]
        if main_attributes:
            main_attributes_dict = string.parse_attrs(main_attributes)
            cms_configuration, instance_tag_name = VideoSettings.basesearch_tag(
                INDEX_MAIN, region, VideoSettings.get_nanny_token(self)
            )
            self._create_queries_resources(
                VideoSettings.COLLECTION_MAIN,
                main_attributes_dict
            )

        cms_configuration, instance_tag_name = VideoSettings.middlesearch_tag(
            middle_type, region, VideoSettings.get_nanny_token(self)
        )

        # NOTE(review): for mmeta every region except MAN (including SAS) uses
        # the VLA service name, while MAN is not offered by RegionParameter --
        # confirm this mapping is intended.
        service_name = VideoSettings.MMETA_NANNY_SERVICE_NAME_VLA
        if middle_type == VideoSettings.MMETA_ID and region == MAN_REGION:
            service_name = VideoSettings.MMETA_NANNY_SERVICE_NAME_MAN
        elif middle_type != VideoSettings.MMETA_ID:
            service_name = VideoSettings.INT_NANNY_SERVICE_NAME
        live_middle_instances = self._get_instances(nanny_client, service_name, middle_type)

        # Instances are (hostname, port, container_hostname) tuples; the first
        # live one serves the config and is named in size-check error messages.
        middle_instance = live_middle_instances[0]
        logging.info('Using instance %s:%s', middle_instance[0], middle_instance[1])

        if _CONFIG_RESOURCE_ID in self.ctx:
            self._load_config(
                middle_instance,
                middle_type,
                self.ctx[_CONFIG_RESOURCE_ID]
            )

        if _DATA_RESOURCE_ID in self.ctx:
            cms_configuration, instance_tag_name = VideoSettings.BAN_TRIE
            self._load_data(
                cms_configuration,
                instance_tag_name,
                self.ctx[_DATA_RESOURCE_ID]
            )

        for collection_name in _FULL_COLLECTION_LIST:
            logging.info('getting queries from %s', collection_name)
            queries_resource_id = _QUERIES_RESOURCE_ID % collection_name
            plan_resource_id = _PLAN_RESOURCE_ID % collection_name
            if queries_resource_id not in self.ctx:
                logging.info('skip %s', queries_resource_id)
                continue

            # `retries` doubles as the index of the instance to query: it only
            # advances when the current instance turned out to be unusable.
            retries = 0
            max_retries = min(25, len(live_middle_instances))
            request_count = self.ctx[RequestCountParameter.name]
            got_reqs = 0
            while retries < max_retries:
                logging.info('%s retry', retries)
                try:
                    loaded = self._load_queries(
                        live_middle_instances[retries],
                        middle_type,
                        collection_name,
                        self.ctx[queries_resource_id],
                        request_count
                    )
                except errors.TaskFailure as e:
                    logging.error("Failed to load queries: %s", e)
                    retries += 1
                    continue

                got_reqs += loaded
                if got_reqs >= request_count:
                    break
                if not loaded:
                    # An instance that yields no usable queries would otherwise
                    # be polled forever; treat it as a failed attempt instead.
                    logging.error('No usable queries loaded, switching to the next instance')
                    retries += 1

            if retries >= max_retries:
                raise errors.TaskFailure(
                    'Failed to acquire queries for collection {} after {} retries.'.format(collection_name, retries)
                )

            queries_path = channel.sandbox.get_resource(self.ctx[queries_resource_id]).path
            plan_path = channel.sandbox.get_resource(self.ctx[plan_resource_id]).path

            if os.path.getsize(queries_path) < 16 * 1024:
                raise errors.TaskFailure(
                    'Queries from host %s:%s, collection %s have very small size %s' % (
                        middle_instance[0],
                        middle_instance[1],
                        collection_name,
                        os.path.getsize(queries_path)
                    )
                )

            convert_queries_to_plan(queries_path, plan_path)
            if os.path.getsize(plan_path) < 16 * 1024:
                # Report the size of the plan itself, not of the queries file
                raise errors.TaskFailure('Plans have very small size %s' % os.path.getsize(plan_path))

            # mark resources as ready
            self.mark_resource_ready(self.ctx[plan_resource_id])
            self.mark_resource_ready(self.ctx[queries_resource_id])

    def _get_instances(self, nany_client, service_name, middle_type):
        """
            Return the live instances of the given nanny service

            The current instances of the service are requested from nanny,
            at most 25 of them are picked at random and every picked one is
            pinged over HTTP.

            :param nany_client: nanny client providing get_service_current_instances()
            :param service_name: name of the nanny service to inspect
            :param middle_type: middlesearch type; mmeta instances are pinged by
                hostname, all other types by container hostname
            :return: non-empty list of (hostname, port, container_hostname)
                tuples of the instances that answered the ping
            :raises errors.TaskFailure: when the service has no instances or
                none of the picked instances answered
        """

        def ping_instance(host_and_port):
            try:
                logging.info('Pinging instance %s:%s', *host_and_port)
                response = urllib.urlopen('http://{}:{}/yandsearch'.format(*host_and_port))
                # Only reachability matters here, so release the connection right away
                response.close()
                return True
            except IOError:
                logging.info('Pinging instance %s:%s failed', *host_and_port)
                return False

        instances = nany_client.get_service_current_instances(service_name)
        instances = instances['result']
        logging.info('Received instances from nanny: %s', instances)

        hosts = [
            (instance['hostname'], int(instance['port']), instance['container_hostname']) for instance in instances
        ]

        if not hosts:
            raise errors.TaskFailure("No hosts resolved for service '%s'" % service_name)

        # Limit the number of pings by checking a random subset of the instances
        sample_size = 25
        if len(hosts) > sample_size:
            hosts = random.sample(hosts, sample_size)

        use_hostname = middle_type == VideoSettings.MMETA_ID
        live_hosts = [
            h for h in hosts
            if ping_instance((h[0] if use_hostname else h[2], h[1]))
        ]
        if not live_hosts:
            raise errors.TaskFailure("Could not find any working instance for service '%s'" % service_name)

        return live_hosts

    def _load_config(self, middle_instance, middle_type, config_resource_id):
        """
            Load the configuration file of the given instance

            :param middle_instance: (hostname, port, container_hostname) tuple
            :param middle_type: middlesearch type; mmeta is addressed by
                hostname, any other type by container hostname
            :param config_resource_id: id of the resource whose path is handed
                to utils.get_config_by_info_request
        """

        target_path = channel.sandbox.get_resource(config_resource_id).path

        if middle_type == VideoSettings.MMETA_ID:
            request_host = middle_instance[0]
        else:
            request_host = middle_instance[2]
        request_port = middle_instance[1]

        utils.get_config_by_info_request(request_host, request_port, target_path)

    def _load_data(self, cms_configuration, instance_tag_name, data_resource_id):
        """
            Load the rearrange data

            Recreates the 'rearrange' and 'pure' folders inside the data
            resource, links 'rearrange/tries' to the 'rearrange' folder itself
            and copies every VideoSettings.MIDDLE_DATA_SUBRESOURCES item into
            'rearrange'. A sub-resource whose last stable version cannot be
            synced is looked up in CMS and downloaded with skynet instead.

            :param cms_configuration: CMS configuration used for the fallback lookup
            :param instance_tag_name: instance tag name used for the fallback lookup
            :param data_resource_id: id of the resource that receives the data
            :raises errors.TaskFailure: when the fallback lookup finds nothing in CMS
        """

        resource_root = channel.sandbox.get_resource(data_resource_id).path

        rearrange_dir = os.path.join(resource_root, 'rearrange')
        pure_dir = os.path.join(resource_root, 'pure')
        for folder in (rearrange_dir, pure_dir):
            paths.make_folder(folder, delete_content=True)

        download_dir = self.abs_path('tmp.data')
        # 'rearrange/tries' resolves to the 'rearrange' folder itself
        os.symlink('.', os.path.join(rearrange_dir, 'tries'))

        # Merge additional resources (player-whitelist, etc.)
        for subresource in VideoSettings.MIDDLE_DATA_SUBRESOURCES:
            try:
                # First choice: the resource from the production release
                stable_path = utils.sync_last_stable_resource(subresource)
                destination = os.path.join(rearrange_dir, os.path.basename(stable_path))
                paths.copy_path(stable_path, destination)
            except errors.TaskError as error:
                logging.warning(error)

                # Fallback: download the resource from the remote manually
                cms_resource = get_cms_resource(
                    subresource.basename,
                    cms_configuration=cms_configuration,
                    instance_tag_name=instance_tag_name
                )
                if not cms_resource:
                    raise errors.TaskFailure('Failed to find {} resource in CMS'.format(subresource.basename))

                remote_path = cms_resource['remote_path']
                common.share.skynet_get(remote_path, download_dir)
                for torrent_file in common.share.files_torrent(remote_path):
                    relative_path = torrent_file['name']
                    # Downloaded files are flattened into the 'rearrange' folder
                    os.rename(
                        os.path.join(download_dir, relative_path),
                        os.path.join(rearrange_dir, os.path.basename(relative_path))
                    )

    def _load_queries(self, middle_instance, middle_type, collection_name, queries_resource_id, request_count):
        """
            Load queries from the eventlog of one instance

            The eventlog is fetched into 'eventlog.txt', filtered and the
            surviving queries are appended to the queries resource file, so
            repeated calls accumulate queries in the same file.

            :param middle_instance: (hostname, port, container_hostname) tuple
            :param middle_type: middlesearch type, substituted into the log name
            :param collection_name: collection to take queries for; the
                pseudo-collection 'all' disables the collection filter
            :param queries_resource_id: id of the resource the queries are appended to
            :param request_count: number of records requested from the log
            :return: number of queries appended by this call
            :raises errors.TaskFailure: when no eventlog data was received
        """

        logging.info('queries resource id %s', queries_resource_id)
        queries_path = channel.sandbox.get_resource(queries_resource_id).path
        logging.info('queries path %s', queries_path)

        # The container hostname is not needed here: logs are requested by hostname
        host, port, _container_host = middle_instance

        custom_log_name = self.ctx[LogNameParameter.name]
        if custom_log_name:
            log_name_templates = [custom_log_name]
        else:
            log_name_templates = [MetaLogName.CURRENT, MetaLogName.PREVIOUS]

        log_names = [
            template.replace('%META_TYPE%', middle_type).replace('%PORT%', str(port))
            for template in log_name_templates
        ]

        log_path = self.path('eventlog.txt')
        paths.remove_path(log_path)

        utils.receive_skynet_key(self.owner)
        res = eventlog.get_eventlogs(
            log_names,
            [host],
            log_path,
            port=port,
            compress=False,
            event_ids=[286, 287],  # FastCacheHit, ContextCreated
            events_filter={'max': request_count, 'min': request_count, 'collection': True, 'fields': [0]},
            collection_filter=collection_name if collection_name != _COLLECTION_ALL else None,
            user='prod-resource-getter',
        )

        if not res or not any(res.itervalues()):
            raise errors.TaskFailure(
                'Cannot load queries from host %s:%s, collection %s' % (host, port, collection_name))

        # Records containing any of these substrings are skipped
        unwanted_markers = ('ag0=vhubmix', '&dh=', '&user_request=&')

        got_reqs = 0
        with open(log_path, 'r') as log_file:
            with open(queries_path, 'a') as queries_file:
                for line in log_file:
                    if 'yandsearch' not in line:
                        continue
                    if any(marker in line for marker in unwanted_markers):
                        continue
                    parts = line.split('\t')
                    if len(parts) < 2:
                        # Malformed record without a tab-separated CGI part: skip
                        # it instead of failing the whole task with IndexError
                        continue
                    query = search_cgi.remove_supermind_controls('/%s?%s' % (parts[0], parts[1]))
                    query = search_cgi.remove_db_timestamp(query)
                    query_parts = query.split('?')
                    # We don't want to have empty queries in our tests
                    if (len(query_parts) < 2) or (len(query_parts[1]) == 0):
                        continue
                    queries_file.write(query)
                    got_reqs += 1

        logging.info('Got %s queries', got_reqs)
        return got_reqs
