import hashlib
import gzip
import logging
import os
import time

from sandbox.sandboxsdk import process
from sandbox.sandboxsdk import parameters

from sandbox.projects.common import apihelpers
from sandbox.projects.common.mediasearch import ban as media_ban
from sandbox.projects.images.bans import resources as images_bans_resources


_BUFFER_SIZE = 4096


class BuildDetailedTrieParameter(parameters.SandboxBoolParameter):
    # Boolean task option, enabled by default. The task looks it up in its
    # context by `name` to decide which key semantics to pass to the indexer.
    description = "Add detailed info to trie"
    name = "build_detailed_trie"
    default_value = True


class ReduceBanParameter(parameters.SandboxBoolParameter):
    # Boolean task option, disabled by default. When set, the task runs the
    # ban-reduce tool over the index lines before building the trie.
    description = "Reduce ban data using unused hosts"
    name = "reduce_ban"
    default_value = False


class ImagesReleaseAntipirateIndexBan(media_ban.ImagesBaseReleaseBanTask):
    """
        Builds antipirate ban resources for Yandex.Images service
    """

    type = "IMAGES_RELEASE_ANTIPIRATE_INDEX_BAN"

    client_tags = media_ban.ImagesBaseReleaseBanTask.client_tags
    cores = 1

    release_subject = "images/middle/antipirateindex-data-{timestamp}"
    release_comment = "daily images ban file (antipirate)"
    push_signal_name = "antipirate_index_ban"

    # Resource types produced by this task: the trie (ban file), the version
    # stamp and the raw grimhold data the trie was built from.
    ban_resource = images_bans_resources.IMAGES_MIDDLESEARCH_ANTIPIRATE_URLS_BAN_FILE
    version_resource = images_bans_resources.IMAGES_MIDDLESEARCH_ANTIPIRATE_URLS_BAN_VERSION
    data_resource = images_bans_resources.IMAGES_MIDDLESEARCH_ANTIPIRATE_URLS_BAN_GRIMHOLD
    release_resources = (ban_resource, version_resource, data_resource)
    # Input-only resource type, consumed when ReduceBanParameter is enabled.
    index_host_stat_resource = images_bans_resources.IMAGES_MIDDLESEARCH_ANTIPIRATE_INDEX_HOST_STAT
    input_parameters = (BuildDetailedTrieParameter, ReduceBanParameter, ) + \
        media_ban.ImagesBaseReleaseBanTask.create_input_parameters(enable_semaphore=True)

    def _build_ban(self):
        """
            Download the latest grimhold data and build the ban trie from it.

            Steps:
              1. fetch the last IMAGE_INDEX_BAN_SAFESEARCH_DATA resource with
                 ``sky get`` and compute the md5 of the data file;
              2. unless the force-build parameter is set, stop if the md5 equals
                 the one of the last released data resource;
              3. convert the data to index lines, optionally reduce them, and
                 feed them to the querydata indexer;
              4. register data, trie and version resources and set the
                 release signal.

            Returns 0 when there is no new data, otherwise the size in bytes
            of the built trie file.
        """
        # download grimhold data and compute its md5
        # NOTE(review): this assumes `sky get` leaves the file at
        # abs_path(data_resource.basename) -- confirm against the resource layout.
        grimhold_data = self.abs_path(self.data_resource.basename)
        grimhold_hasher = hashlib.md5()

        # NOTE(review): if the helper can return None, the attribute access
        # below fails with AttributeError -- confirm whether that is possible.
        grimhold_resource = apihelpers.get_last_resource("IMAGE_INDEX_BAN_SAFESEARCH_DATA")
        process.run_process(["sky", "get", grimhold_resource.skynet_id], log_prefix="grimhold_sky_get")

        # Binary mode: the digest must be computed over the raw bytes of the
        # file, without any text decoding or newline translation.
        with open(grimhold_data, "rb") as grimhold_data_file:
            for chunk in iter(lambda: grimhold_data_file.read(_BUFFER_SIZE), b""):
                grimhold_hasher.update(chunk)

        released_data = apihelpers.get_last_released_resource(self.data_resource)
        if not self.ctx.get(media_ban.ForceBuildParameter.name):
            if released_data and released_data.file_md5 == grimhold_hasher.hexdigest():
                logging.info("No new data found, exiting...")
                return 0

        self._register_ban(self.descr, grimhold_data, self.data_resource)

        # url2fastban: turn grimhold data into index lines
        index_path = self.abs_path("index.txt")
        hasher_tool = self._tool(images_bans_resources.IMAGES_URL2FASTBAN_EXECUTABLE)

        process.run_process([
            hasher_tool,
            "--mode", "index-lines",
            "--input", grimhold_data,
            "--output", index_path], outputs_to_one_file=False, log_prefix="hasher")

        if self.ctx.get(ReduceBanParameter.name, False):
            # optional step: shrink the index lines using released host statistics
            index_host_stat_data = self.abs_path(self.index_host_stat_resource.basename)
            index_host_stat_released = apihelpers.get_last_released_resource(self.index_host_stat_resource)
            process.run_process(["sky", "get", index_host_stat_released.skynet_id], log_prefix="index_host_stat_sky_get")

            reduced_path = self.abs_path("reduced.txt")
            ban_reduce_tool = self._tool(images_bans_resources.IMAGES_BAN_REDUCE_TOOL_EXECUTABLE)

            process.run_process([
                ban_reduce_tool,
                "BanReduce",
                "--input", index_path,
                "--index-host-stat", index_host_stat_data,
                "--output", reduced_path], outputs_to_one_file=False, log_prefix="ban_reduce")

            # the indexer below must consume the reduced file instead
            index_path = reduced_path

        # querydata_indexer: build the trie from the index lines
        trie_path = self.abs_path(self.ban_resource.basename)
        version_path = self.abs_path(self.version_resource.basename)
        indexer_tool = self._tool(images_bans_resources.IMAGES_QUERYDATAINDEXER_EXECUTABLE)

        if self.ctx.get(BuildDetailedTrieParameter.name, True):
            key_semantics = "exacturl,tld,ipregregion"
        else:
            key_semantics = "exacturl"

        process.run_process([
            indexer_tool,
            "--data-namespace", "localurlsban",
            "--key-semantics", key_semantics,
            "--local-input", index_path,
            "--output", trie_path], outputs_to_one_file=False, log_prefix="indexer")
        self._register_ban(self.descr, trie_path, self.ban_resource)

        # version file holds the build time as integer seconds since the epoch
        with open(version_path, "w") as version_file:
            version_file.write(str(int(time.time())))
        self._register_ban(self.descr, version_path, self.version_resource)
        self._set_release_signal()

        return os.stat(trie_path).st_size

    def _test_ban(self, build_task_id):
        """
            Return a one-element list with the result of self._test_task for
            the ban (trie) resource of the given build task.
        """
        return [self._test_task(build_task_id, self.ban_resource)]


# Module-level alias pointing at the task class defined above.
# NOTE(review): presumably the name the Sandbox loader looks for when
# discovering the task in this module -- confirm against the framework.
__Task__ = ImagesReleaseAntipirateIndexBan
