# -*- coding: utf-8 -*-

import re

from sandbox.projects import resource_types
from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk.errors import SandboxTaskUnknownError

from sandbox.projects.common.config_processor import get_parameter_values
from sandbox.projects.common.search.eventlog import load_eventlog
from sandbox.projects.images.metasearch import resources as images_metasearch_resources


# Regex fragment for the path of the SearchSource sections nested in the
# Collection section whose id is "yandsearch"; other attributes may appear
# before or after the id as long as they contain no '/'.
MAIN_COLLECTION_PATH = r'Collection(?:[^/]+)?\s+id="yandsearch"(?:\s+[^/]+)?/SearchSource'
# Compiled paths of the two per-source parameters read by
# GetQueriesFromEventlog._parse_config via get_parameter_values().
# NOTE(review): the exact matching semantics are defined by get_parameter_values,
# which is not visible in this file.
SERVER_DESCR_PARAMETER_PATH = re.compile(MAIN_COLLECTION_PATH + '/ServerDescr')
CGI_SEARCH_PREFIX_PARAMETER_PATH = re.compile(MAIN_COLLECTION_PATH + '/CgiSearchPrefix')


class MiddleConfigParameter(parameters.ResourceSelector):
    """Required input: resource holding the middlesearch config to parse.

    The task syncs this resource and reads the source hosts from it.
    """
    name = 'middle_config_resource_id'
    description = 'middlesearch config'
    # Accepted resource types: two are given by name, the images one as the
    # resource object imported from images.metasearch.resources.
    resource_type = ['MIDDLESEARCH_CONFIG', images_metasearch_resources.IMAGES_MIDDLESEARCH_CONFIG, 'VIDEO_MIDDLESEARCH_CONFIG']
    required = True


class SourceNameParameter(parameters.SandboxStringParameter):
    """Name of the search source whose hosts are looked up in the config.

    The value is compared with the ServerDescr values found in the
    middlesearch config; defaults to 'WEB'.
    """
    name = 'source_name'
    description = 'Source name'
    default_value = 'WEB'


class EvlogdumpExecutableParameter(parameters.ResourceSelector):
    """Required input: EVLOGDUMP_EXECUTABLE resource.

    The task runs this executable on the eventlog and captures its stdout
    as a text file.
    """
    name = 'evlogdump_executable_resource_id'
    description = 'evlogdump executable'
    resource_type = 'EVLOGDUMP_EXECUTABLE'
    required = True


class EventlogParameter(parameters.ResourceSelector):
    """Required input: EVENTLOG_DUMP resource passed to the evlogdump executable."""
    name = 'eventlog_resource_id'
    description = 'eventlog'
    resource_type = 'EVENTLOG_DUMP'
    required = True


class SaveRawTxtEventlogParameter(parameters.SandboxBoolParameter):
    """Flag: when set, the text output of evlogdump is also published as an OTHER_RESOURCE resource."""
    name = 'save_raw_txt_eventlog'
    description = 'save raw txt eventlog'


class PatchQueries(parameters.SandboxBoolParameter):
    """Flag: when set, the extracted queries are rewritten by patch_queries() before being saved."""
    name = 'patch_queries'
    description = 'Remove source number from snippet queries'


class RequiredMaxLength(parameters.SandboxIntegerParameter):
    """Length threshold used for the 'query_length_too_big_per_cent' statistic.

    Queries strictly longer than this value are counted as too long.
    """
    name = 'required_max_length'
    description = 'Required max length of query for basesearch'
    default_value = 17500


class GetQueriesFromEventlog(SandboxTask):
    """
        Extracts from an eventlog the queries that were sent to the hosts of
        one search source (WEB by default) and stores them, one per line,
        in a PLAIN_TEXT_QUERIES resource.
    """
    type = 'GET_QUERIES_FROM_EVENTLOG'

    execution_space = 100000

    input_parameters = (
        PatchQueries,
        MiddleConfigParameter,
        SourceNameParameter,
        EvlogdumpExecutableParameter,
        EventlogParameter,
        RequiredMaxLength,
        SaveRawTxtEventlogParameter,
    )

    def on_enqueue(self):
        """Register the output 'queries.txt' resource and remember its id in the context."""
        SandboxTask.on_enqueue(self)
        channel.task = self
        self.ctx['out_resource_id'] = self.create_resource(
            self.descr,
            'queries.txt',
            'PLAIN_TEXT_QUERIES',
            arch='any'
        ).id

    def on_execute(self):
        """Dump the eventlog to text, collect the queries and write them out.

        Writes the following statistics to the task context:
        'num_of_queries', 'avg_query_length', 'max_query_length' and
        'query_length_too_big_per_cent'.

        Raises:
            SandboxTaskUnknownError: if the config cannot be parsed or no
                queries were found in the eventlog.
        """
        config_path = self.sync_resource(self.ctx[MiddleConfigParameter.name])
        source_name = self.ctx.get(SourceNameParameter.name, SourceNameParameter.default_value)

        evlogdump_executable_path = self.sync_resource(self.ctx[EvlogdumpExecutableParameter.name])
        eventlog_path = self.sync_resource(self.ctx[EventlogParameter.name])

        hosts = self._parse_config(config_path, source_name)

        evlogdump_txt_file_name = self.abs_path('evlogdump.txt')
        cmd = [evlogdump_executable_path, '-o', '-i', '300', eventlog_path]  # 300=SubSourceRequest
        with open(evlogdump_txt_file_name, "w") as f:
            run_process(cmd, stdout=f, log_prefix='evlogdump', wait=True, outputs_to_one_file=False)

        if self.ctx[SaveRawTxtEventlogParameter.name]:
            r = self.create_resource(
                'evlogdump in txt format',
                evlogdump_txt_file_name,
                resource_types.OTHER_RESOURCE
            )
            self.mark_resource_ready(r)

        queries = load_eventlog.get_requests_to_hosts(evlogdump_txt_file_name, [(host, None) for host in hosts])

        if len(queries) == 0:
            raise SandboxTaskUnknownError('cannot get queries from eventlog')

        if self.ctx[PatchQueries.name]:
            queries = patch_queries(queries)

        # Measure every query once and reuse the lengths for all statistics.
        num_of_queries = len(queries)
        query_lengths = [len(q) for q in queries]
        max_query_length = max(query_lengths)

        self.ctx['num_of_queries'] = num_of_queries
        self.ctx['avg_query_length'] = sum(query_lengths) / num_of_queries
        self.ctx['max_query_length'] = max_query_length

        # The threshold does not depend on the query, so look it up once.
        # Without an explicit limit the maximum length is used, which makes
        # the percentage 0.
        length_limit = self.ctx.get(RequiredMaxLength.name, max_query_length)
        too_long_count = sum(1 for length in query_lengths if length > length_limit)
        self.ctx['query_length_too_big_per_cent'] = 100.0 * too_long_count / num_of_queries

        out_resource = channel.sandbox.get_resource(self.ctx['out_resource_id'])
        with open(out_resource.path, 'w') as out_file:
            out_file.write('\n'.join(queries))

    def _parse_config(self, config_path, source_name):
        """Return the host names listed for ``source_name`` in the middlesearch config.

        ServerDescr and CgiSearchPrefix values are read from the config and
        paired by position; for the first pair whose ServerDescr equals
        ``source_name`` every host found in an 'http://host:' fragment of the
        CgiSearchPrefix is returned.

        Raises:
            SandboxTaskUnknownError: if the two value lists differ in length
                or no ServerDescr equals ``source_name``.
        """
        # Close the config file deterministically instead of leaking the handle.
        with open(config_path) as config_file:
            config = config_file.read()

        server_descr_list = get_parameter_values(config, SERVER_DESCR_PARAMETER_PATH)
        cgi_search_prefix_list = get_parameter_values(config, CGI_SEARCH_PREFIX_PARAMETER_PATH)

        if len(server_descr_list) != len(cgi_search_prefix_list):
            raise SandboxTaskUnknownError("cannot parse config")

        for descr, cgi_search_prefix in zip(server_descr_list, cgi_search_prefix_list):
            if descr == source_name:
                return re.findall('http://([^:]+):', cgi_search_prefix)

        raise SandboxTaskUnknownError("cannot parse config")


def patch_queries(queries):
    """Return a copy of ``queries`` with the text right after each 'dh=' trimmed.

    For every occurrence of 'dh=' the characters that follow it, up to and
    including the first '-', are removed; when no '-' follows, nothing is
    removed for that occurrence. Scanning resumes after the removed part.
    """
    marker = 'dh='
    result = []
    for query in queries:
        pieces = []
        remainder = query
        while True:
            head, found, tail = remainder.partition(marker)
            if not found:
                break
            # Keep everything up to and including the marker.
            pieces.append(head + found)
            # Skip through the first '-' after the marker, if there is one.
            _, dash, after_dash = tail.partition('-')
            remainder = after_dash if dash else tail
        pieces.append(remainder)
        result.append(''.join(pieces))
    return result


# Module-level alias of the task class.
# NOTE(review): presumably the Sandbox task loader looks this name up - confirm.
__Task__ = GetQueriesFromEventlog
