# -*- coding: utf-8 -*-

import itertools
import logging
import os
import random
import subprocess

from sandbox import common
import sandbox.common.types.client as ctc

from sandbox.projects import resource_types
from sandbox.sandboxsdk import parameters, process
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk.task import SandboxTask

from sandbox.projects.common.base_search_quality import threadPool

from sandbox.projects.common import apihelpers

from kernel.util.functional import memoized


class DatabaseParameter(parameters.ResourceSelector):
    """Input parameter selecting the search database resource to work with."""
    name = 'database_resource_id'
    description = 'Database:'
    # Resource types accepted by the selector.
    resource_type = [
        resource_types.IMAGES_SEARCH_DATABASE,
        resource_types.VIDEO_SEARCH_DATABASE,
        resource_types.VIDEO_QUICK_SEARCH_DATABASE,
        resource_types.VIDEO_ULTRA_SEARCH_DATABASE,
    ]

    @common.utils.classproperty
    def default_value(cls):
        """Return the id of the resource found by ``apihelpers.get_last_resource``, or None."""
        latest = apihelpers.get_last_resource(cls.resource_type)
        if not latest:
            return None
        return latest.id


class RequestLimitParameter(parameters.SandboxIntegerParameter):
    """Integer input: upper bound on the number of unique requests to generate."""
    default_value = 100000
    name = 'request_limit'
    description = 'Unique requests number upper limit:'


# Allowed values of the 'improxy_type' task parameter.
IMPROXY_TYPE_IMAGES = 'images'
IMPROXY_TYPE_VIDEO = 'video'
# With the custom type, request hosts are taken from the 'host_list' parameter.
IMPROXY_TYPE_CUSTOM = 'custom'


class HostListParameter(parameters.SandboxStringParameter):
    """String input holding a list of hosts joined by ``separator``."""
    # Delimiter between hosts; also interpolated into the description below.
    separator = ','
    name = 'host_list'
    description = 'Hosts, {0}-separated:'.format(separator)


class ImproxyTypeParameter(parameters.SandboxStringParameter):
    """String input choosing which improxy flavour the requests are generated for."""
    name = 'improxy_type'
    description = 'Improxy type:'
    # Each choice is a (label, value) style pair where both elements are the type name itself.
    choices = [(x, x) for x in (IMPROXY_TYPE_IMAGES, IMPROXY_TYPE_VIDEO, IMPROXY_TYPE_CUSTOM)]
    # NOTE(review): presumably makes the host list field visible only when the
    # custom type is selected -- confirm against the parameters framework.
    sub_fields = {
        IMPROXY_TYPE_CUSTOM: [HostListParameter.name],
    }
    default_value = IMPROXY_TYPE_IMAGES


class DuplicateAndShuffleParameter(parameters.SandboxBoolParameter):
    """Boolean input: emit every request twice, in shuffled order (off by default)."""
    default_value = False
    name = 'duplicate_and_shuffle'
    description = 'Duplicate and shuffle requests for cache tests'


class ProtobufParameter(parameters.SandboxBoolParameter):
    """Boolean input: when set, the dump is parsed with the JSON extractor (off by default)."""
    default_value = False
    name = 'protobuf_data'
    description = 'Storage consists protobuf data'


class ThreadParams(object):
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)


class ImproxyGenerateRequests(SandboxTask):
    """
        Dumps the indexarc of the image index, extracts thumbnail ids from it
        and builds requests with those ids for the thumbnailer proxy.
    """
    type = 'IMPROXY_GENERATE_REQUESTS'
    input_parameters = (DatabaseParameter, RequestLimitParameter, ImproxyTypeParameter, HostListParameter,
                        DuplicateAndShuffleParameter, ProtobufParameter)

    execution_space = 60000

    client_tags = ctc.Tag.Group.LINUX

    # Suffixes substituted into "<prefix>-tub-<suffix>.yandex.net" for non-custom hosts.
    LOCATION_SUFFIXES = ["ru", "com", "tr", "kz", "by", "ua"]
    # Location prefixes for the images improxy type; built once instead of per request.
    IMAGES_LOCATION_PREFIXES = ("im0", "im1", "im2", "im3")

    def get_result_path(self):
        """Return the absolute path of the generated requests file."""
        return self.abs_path('improxy.requests')

    def on_enqueue(self):
        """Run the base enqueue hook and register the output requests resource."""
        SandboxTask.on_enqueue(self)

        self._create_resource(
            self.descr,
            self.get_result_path(),
            resource_types.IMPROXY_PLAIN_TEXT_REQUESTS,
            arch="any")

    def choose_request_location_prefix(self):
        """Return a random "imN" prefix for the images type, "video" otherwise."""
        if self.ctx[ImproxyTypeParameter.name] == IMPROXY_TYPE_IMAGES:
            return random.choice(self.IMAGES_LOCATION_PREFIXES)
        return "video"

    @memoized
    def custom_hosts(self):
        """
            Return the user-supplied hosts for the custom improxy type.

            Whitespace around each host is stripped and empty entries are dropped,
            so "a, b," yields ["a", "b"].

            Raises ValueError when no usable host remains.
        """
        assert self.ctx[ImproxyTypeParameter.name] == IMPROXY_TYPE_CUSTOM
        raw_hosts = self.ctx[HostListParameter.name] or ''
        stripped_hosts = [host.strip() for host in raw_hosts.split(HostListParameter.separator)]
        hosts = [host for host in stripped_hosts if host]
        if not hosts:
            raise ValueError("Custom improxy type requires a non-empty host list")
        return hosts

    def choose_request_host(self):
        """Return a random host name to put into a single generated request."""
        if self.ctx[ImproxyTypeParameter.name] == IMPROXY_TYPE_CUSTOM:
            return random.choice(self.custom_hosts())
        # The prefix is chosen before the suffix so that the sequence of draws
        # from the seeded random generator stays the same.
        location_prefix = self.choose_request_location_prefix()
        location_suffix = random.choice(self.LOCATION_SUFFIXES)
        return "%s-tub-%s.yandex.net" % (location_prefix, location_suffix)

    def on_execute(self):
        """
            Dump the database archive in parallel, collect unique thumbnail ids
            and write one request URL per id into the result file.
        """
        database_resource_id = self.ctx[DatabaseParameter.name]
        request_limit = self.ctx[RequestLimitParameter.name]

        self.sync_resource(database_resource_id)
        # At least one worker, otherwise a single-CPU client divides by zero below.
        thread_count = max(1, int(self.client_info['ncpu']) // 2)
        database_resource = channel.sandbox.get_resource(database_resource_id)
        # Seed with the database checksum so random choices are tied to the database.
        random.seed(database_resource.file_md5)
        database_path = database_resource.path
        index_path = os.path.join(database_path, 'index')
        tarcview_resource_id = apihelpers.get_last_resource(resource_types.IMAGES_TARCVIEW_EXECUTABLE).id
        tarcview_path = self.sync_resource(tarcview_resource_id)

        def extract_json(pipe_out):
            """Yield thumbnail ids from dump lines of the form "<key>\\t<json>"."""
            import re
            import json

            regex = re.compile(r'^\w+\t({".+?":{.+?}})$')
            for line in pipe_out:
                regex_result = regex.search(line)
                if not regex_result:
                    continue
                try:
                    json_data = json.loads(regex_result.group(1))
                    image_attrs = json_data['ImageAttrs']
                    # The big thumbnail, when present, is emitted for about half of the docs.
                    if 'BigThumb' in image_attrs and random.randint(1, 2) == 1:
                        yield image_attrs['BigThumb']['Id']
                    yield image_attrs['Thumb']['Id']
                except (KeyError, TypeError, ValueError):
                    # Malformed JSON or unexpected attribute shape: skip the doc
                    # instead of failing the whole worker.
                    logging.info("Bad doc in protobuf")

        def extract_csv(pipe_out):
            """Yield the field following the "\\ti\\t" marker from each dump line that has it."""
            thumb_id_marker = "\ti\t"
            for line in pipe_out:
                pos = line.find(thumb_id_marker)
                if pos != -1:
                    pos += len(thumb_id_marker)
                    yield line[pos:line.find("\t", pos + 1)]

        def extract_csv_video_non_taas(pipe_out):
            """Yield ThumbBackendId from "thattrs=<json>" lines, skipping "taas" storage."""
            import json
            thattr_marker = "thattrs"
            for line in pipe_out:
                # Split on the first '=' only: the JSON payload may contain '=' itself.
                key, delimiter, payload = line.partition('=')
                if key != thattr_marker or not delimiter:
                    continue
                json_data = json.loads(payload)
                if "Storage" in json_data and json_data["Storage"] == "taas":
                    continue
                if "ThumbBackendId" in json_data:
                    yield json_data["ThumbBackendId"]

        # The extractor depends only on the task parameters, so pick it once.
        if self.ctx[ProtobufParameter.name]:
            extract_id_func = extract_json
        elif self.ctx[ImproxyTypeParameter.name] == IMPROXY_TYPE_VIDEO:
            extract_id_func = extract_csv_video_non_taas
        else:
            extract_id_func = extract_csv

        # For speed the archive is dumped through a pool of workers,
        # each of the N workers getting 1/N of the documents.
        def dump_function(chunk, params):
            """Dump every docid range of the chunk and return the set of extracted ids."""
            result = set()
            limit_reached = False
            for docid_range in chunk:
                command = params.command_base + [str(docid_range[0]), str(docid_range[1])]
                dump_process = process.run_process(command, check=True, wait=False, stdout=subprocess.PIPE)
                for thumb_id in extract_id_func(dump_process.stdout):
                    result.add(thumb_id)
                    if len(result) >= params.per_thread_limit:
                        dump_process.kill()
                        limit_reached = True
                        break
                if limit_reached:
                    # Do not spawn dump processes for the remaining ranges.
                    break
            logging.info("Extracted %d thumbnail ids, thread finished", len(result))
            return result

        params = ThreadParams(command_base=[tarcview_path, '-ae', index_path],
                              # Dump with a margin in case some workers return fewer results
                              per_thread_limit=int(request_limit // thread_count * 1.2))

        # NOTE(review): assumes 'indexdir' holds one 8-byte record per document -- confirm.
        doc_count = os.stat(os.path.join(database_path, 'indexdir')).st_size // 8
        # Integer split: up to thread_count - 1 trailing documents are not covered.
        range_size = doc_count // thread_count
        ranges = [(i * range_size, (i + 1) * range_size) for i in range(thread_count)]
        result_chunks = threadPool.process_data(dump_function, ranges, params,
                                                use_processes=True, process_count=thread_count)
        unique_ids = set()
        for chunk in result_chunks:
            unique_ids.update(chunk)
        # islice caps the list at request_limit and is a plain copy when fewer ids were found.
        thumb_ids = list(itertools.islice(unique_ids, request_limit))
        self.set_info("Total number of extracted unique thumbnail ids: %d" % len(thumb_ids))
        if self.ctx[DuplicateAndShuffleParameter.name]:
            thumb_ids.extend(list(thumb_ids))
            random.shuffle(thumb_ids)

        with open(self.get_result_path(), 'w') as result_file:
            for thumb_id in thumb_ids:
                result_file.write("http://%s/i?id=%s&n=21\n" % (self.choose_request_host(), thumb_id))


# Module-level alias of the task class.
# NOTE(review): presumably how the Sandbox loader discovers the task -- confirm.
__Task__ = ImproxyGenerateRequests
