import itertools
import logging
import multiprocessing
import os.path
import urllib

from sandbox.common.types import client as ctc

from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk import sandboxapi
from sandbox.sandboxsdk import task

from sandbox.projects import resource_types
from sandbox.projects.common import string
from sandbox.projects.common import utils
from sandbox.projects.common.search import settings as media_settings
from sandbox.projects.common.mediasearch import user_requests
from sandbox.projects.common.mediasearch import yasm_task
from sandbox.projects.images.pumpkin import resources as images_pumpkin_resources


# Task context key written by on_execute once the main work has completed,
# so that a repeated on_execute call skips straight to the yasm notification.
_EXECUTED_KEY = "_task_executed"


class ReportUrl(parameters.SandboxStringParameter):
    """
        Base url of the Images report used to fetch SERPs

        The value is passed through str.format() with the query tld before use
        (see _process_query_line), so it may contain a '{}' placeholder.
    """
    name = "report_url"
    description = "Url to load SERP of Images"
    default_value = "https://hamster.yandex.ru/images/search?"


class MinQueriesNumberParameter(parameters.SandboxIntegerParameter):
    """
        Minimum number of queries the generated plan should contain

        When the user queries alone produce fewer lines than this value, the task
        tops the plan up from the top queries list.
    """
    name = 'min_queries'
    # Fixed typo in the user-visible text ("Mininum" -> "Minimum")
    description = 'Minimum number of queries'
    default_value = 10000


class AddAttrs(parameters.SandboxStringParameter):
    """
        Extra attributes for the output resource

        The string is parsed with string.parse_attrs() when the resource is
        created in on_enqueue. No default value is declared here.
    """
    name = 'resource_attrs'
    description = 'Set additional attrs to resources (ex.: attr1=v1, attr2=v2)'


class PoolSizeParameter(parameters.SandboxIntegerParameter):
    """
        Number of worker processes in the multiprocessing pool that fetches SERPs
    """
    name = 'pool_size'
    description = 'Working pool size'
    default_value = 30


class ImagesGenerateCbirUserQueries(yasm_task.YasmTask, task.SandboxTask):
    """
        Generate user queries for CBIR

        Extract image urls from first shards and enhance original plan with 'url=' cgi parameters
    """

    type = "IMAGES_GENERATE_CBIR_USER_QUERIES"
    push_signal_name = "generate_cbir_user_queries"

    input_parameters = (
        ReportUrl,
        MinQueriesNumberParameter,
        AddAttrs,
        PoolSizeParameter,
    )

    # RMDEV-2374
    cores = 4
    ram = 63 * 1024  # 63 Gb (value presumably expressed in Mb)
    execution_space = 4 * 1024  # 4 Gb
    client_tags = ctc.Tag.Group.LINUX & (ctc.Tag.GENERIC | ctc.Tag.MULTISLOT)

    def on_enqueue(self):
        """
            Declare the output USERS_QUERIES resource at enqueue time

            The resource points at the plan file that on_execute fills in later;
            extra attributes come from the AddAttrs parameter.
        """
        task.SandboxTask.on_enqueue(self)
        self.create_resource(
            self.descr,
            self.__get_cbir_queries(),
            resource_types.USERS_QUERIES,
            attributes=string.parse_attrs(utils.get_or_default(self.ctx, AddAttrs)),
            arch=sandboxapi.ARCH_ANY,
        )

    def on_execute(self):
        """
            Generate the plan once, then send the yasm notification
        """
        # yasm_notify can raise TemporaryError, so we need to avoid a double work
        if _EXECUTED_KEY not in self.ctx:
            self.__on_execute()
            self.ctx[_EXECUTED_KEY] = 1

        self._yasm_notify()

    def __on_execute(self):
        """
            Build the CBIR plan file

            All user queries are processed first. If they yield fewer than
            MinQueriesNumberParameter lines, the plan is topped up from the
            top queries list until the minimum is reached.
        """
        pool = multiprocessing.Pool(utils.get_or_default(self.ctx, PoolSizeParameter))
        min_count = utils.get_or_default(self.ctx, MinQueriesNumberParameter)

        try:
            with open(self.__get_cbir_queries(), 'w') as cbir_queries_file:
                with open(self.__get_user_queries()) as user_queries_file:
                    count = self.__generate_queries(
                        pool,
                        cbir_queries_file,
                        user_queries_file
                    )

                if count >= min_count:
                    return

                with open(self.__get_top_queries()) as top_queries_file:
                    self.__generate_queries(
                        pool,
                        cbir_queries_file,
                        # The query is taken from the second whitespace-separated field.
                        # NOTE(review): a line with fewer than two fields raises IndexError
                        # inside the pool feeder - confirm the file format guarantees it.
                        (line.split()[1] for line in top_queries_file),
                        max_count=min_count - count
                    )
        finally:
            # Always release the worker processes. terminate() (rather than close())
            # is used because the top-up phase may stop early and the remaining
            # queued queries are of no interest.
            pool.terminate()
            pool.join()

    def __generate_queries(self, pool, cbir_queries_file, queries_iterator, max_count=None):
        """
            Run queries through the pool and append successful results to the plan

            :param pool: multiprocessing pool used to call _process_query_line
            :param cbir_queries_file: writable file object receiving one line per result
            :param queries_iterator: iterable of source query lines
            :param max_count: stop after writing this many lines; None means no limit
            :return: number of lines written
        """
        if max_count is not None and max_count <= 0:
            return 0

        # Same lookup style as the other parameters; falls back to the declared default
        report_url = utils.get_or_default(self.ctx, ReportUrl)
        queries_iterator = itertools.izip(itertools.repeat(report_url), queries_iterator)
        count = 0
        for output in pool.imap_unordered(_process_query_line, queries_iterator):
            # None means that no image url was found for the query
            if output is None:
                continue
            cbir_queries_file.write(output + "\n")
            count += 1
            # '>=' so that exactly max_count lines are written ('>' produced one extra)
            if max_count is not None and count >= max_count:
                break
        return count

    def __get_cbir_queries(self):
        """
            Return the absolute path of the output plan file
        """
        return self.abs_path("cbir-queries.txt")

    def __get_user_queries(self):
        """
            Sync the latest SERP user queries resource and return its local path
        """
        user_queries_resource_id = utils.get_and_check_last_resource_with_attribute(
            resource_types.USERS_QUERIES,
            media_settings.ImagesSettings.QUERIES_ATTR_NAME,
            media_settings.ImagesSettings.QUERIES_TYPE_SERP
        ).id
        return self.sync_resource(user_queries_resource_id)

    def __get_top_queries(self):
        """
            Sync the latest pumpkin index for the default tld and return the path
            to its topqueries.txt
        """
        pumpkin_attributes = media_settings.ImagesSettings.pumpkin_resource_attributes(user_requests.DEFAULT_TLD)
        pumpkin_resource_id = utils.get_and_check_last_resource_with_attribute(
            images_pumpkin_resources.IMAGES_PUMPKIN_INDEX,
            pumpkin_attributes[0],
            pumpkin_attributes[1]
        ).id
        return os.path.join(self.sync_resource(pumpkin_resource_id), "topqueries.txt")


# TODO: generate multiple output strings with different urls to minimize problems with avatarnica, 404 etc.
def _process_query_line(args):
    """
        Turn one source query line into a CBIR query line

        Runs in a pool worker, hence the single tuple argument.

        :param args: (report_url, line) pair; report_url is formatted with the query tld
        :return: the query line with '&url=<quoted image url>' appended to its cgi
                 parameters, or None when no image url was found for the query
    """
    report_url, line = args
    query_text, query_region, query_cgi_params, query_tld = user_requests.split_query_line(line)
    serp_url = report_url.format(query_tld)
    candidates = user_requests.get_image_urls(serp_url, query_text, query_cgi_params)

    # Only the first image url is used; the sentinel marks "no urls at all"
    nothing = object()
    image_url = next(iter(candidates), nothing)
    if image_url is nothing:
        logging.info("Nothing found for [{}]".format(line))
        return None

    logging.info("Adding [{}] to plan".format(line))
    cgi_with_url = "{}&url={}".format(query_cgi_params, urllib.quote(image_url, safe=''))
    return user_requests.join_query_line(query_text, query_region, cgi_with_url, query_tld)


# Module-level alias for the task class.
# NOTE(review): presumably the name the Sandbox task loader looks up - confirm.
__Task__ = ImagesGenerateCbirUserQueries
