import os
import os.path
import logging
import shutil

import sandbox.common.types.client as ctc

from sandbox.projects import resource_types
from sandbox.projects.common.apihelpers import get_task_resource_id, get_last_resource_with_attribute
from sandbox.projects.common.build.ArcadiaTask import ArcadiaTask
from sandbox.projects.querysearch_saas.prepare_saas_querysearch_data import PrepareSaasQuerysearchData
from sandbox.projects.common.utils import check_subtasks_fails, get_or_default
from sandbox.projects.geosearch import resource_types as grt
from sandbox.projects.common.vcs import arc

from sandbox.projects.BuildAddrSnippetDataRequester import BuildAddrSnippetDataRequester
from sandbox.projects.GenerateLinearModelBinaryDump import get_latest_resource
from sandbox.projects.PSUtil import print_folder_tree

from sandbox import sdk2
from sandbox.sandboxsdk import parameters, environments
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk.errors import SandboxTaskFailureError
from sandbox.sandboxsdk.paths import get_logs_folder
from sandbox.sandboxsdk.process import run_process


class ADDRESS_SNIPPET_TSV_EXPORT(sdk2.resource.AbstractResource):
    # Resource type looked up by BuildAddrSnippetData.get_export_data(): the latest
    # resource with attribute released=stable is copied to addrsnip.csv.gz and gunzipped.
    auto_backup = True


class PrepareSaasQuerydata(parameters.SandboxBoolParameter):
    # Gates the PrepareSaasQuerysearchData.upload_to_yt() call (and both release
    # steps nested under it) in BuildAddrSnippetData.on_execute().
    name = 'prepare_saas'
    description = 'Prepare releasable data for querysearch on YT'
    default_value = False


class UpdateSaasQuerydataProd(parameters.SandboxBoolParameter):
    # Checked in BuildAddrSnippetData.on_execute() only when 'prepare_saas' is set;
    # when true, PrepareSaasQuerysearchData.on_release(self, False) is called.
    name = 'update_saas'
    description = 'Release data for querysearch to saas (PRODUCTION)'
    default_value = False


class UpdateSaasQuerydataTesting(parameters.SandboxBoolParameter):
    # Checked in BuildAddrSnippetData.on_execute() only when 'prepare_saas' is set;
    # when true, PrepareSaasQuerysearchData.on_release(self, True) is called.
    name = 'update_saas_testing'
    description = 'Release data for querysearch to saas (testing)'
    default_value = False


class BuildAddrSnippetData(ArcadiaTask, PrepareSaasQuerysearchData):

    class ItemsProcessLimit(parameters.SandboxIntegerParameter):
        # Passed as --limit to the stage-1 make_addrsnip_data run in
        # prepare_data_for_subtasks(); per the description, 0 means "process all".
        name = 'items_limit'
        description = 'Process only this many items from backa export (intended for debug purposes, 0 = all)'
        default_value = 0
        required = False

    class UseAggregatorsFile(parameters.SandboxBoolParameter):
        # When true, prepare_data_for_subtasks() appends
        # --aggregators <get_aggregators_file()> to the stage-1 command line.
        name = 'use_aggregators'
        description = 'Ban hosts by aggregators list'
        default_value = True
        required = True

    class WorkersNum(parameters.SandboxIntegerParameter):
        # Used both as --number_files for the stage-1 split (one ADDRSNIP<i>.pkl per
        # worker) and as the number of requester subtasks started by run_subtasks().
        name = 'workers_num'
        description = 'Number of workers to run in parallel'
        default_value = 10
        required = True

    class WorkerTimeout(parameters.SandboxIntegerParameter):
        # Given in hours; run_subtasks() multiplies it by 60 * 60 and passes the
        # result to every subtask as 'kill_timeout'.
        name = 'workers_timeout'
        description = 'Worker timeout (in hours)'
        default_value = 24
        required = True

    class SourceTask(parameters.TaskSelector):
        # Optional. When set, prepare_data() skips preparing and running its own
        # subtasks, and aggregate_from_subtasks() reads the results from the
        # children of the selected task instead.
        name = 'get_child_tasks_from'
        description = 'Task to take child results from'
        required = False
        task_type = "BUILD_ADDR_SNIPPET_DATA"

    class LatestLinkName(parameters.SandboxStringParameter):
        # Forwarded by on_execute() as the last argument of
        # PrepareSaasQuerysearchData.upload_to_yt().
        name = 'latest_link'
        description = 'name for link to latest table, do not create link if empty'

    class MakeAddrsnipDataExecutable(parameters.LastReleasedResource):
        # Synced and run with --stage 1 in prepare_data_for_subtasks(); the same
        # parameter value is forwarded to every requester subtask by run_subtasks().
        name = 'make_addrsnip_data_executable'
        description = 'make_addrsnip_data executable'
        resource_type = grt.MAKE_ADDRSNIP_DATA_EXECUTABLE

    class AddrsnipNormalizeUrlExecutable(parameters.LastReleasedResource):
        # Executable that normalize_urls() runs with JSON_PATH_RAW on stdin and
        # JSON_PATH_NORM on stdout.
        name = 'addrsnip_normalize_url_executable'
        description = 'normalize_url executable'
        resource_type = grt.ADDRSNIP_NORMALIZE_URL_EXECUTABLE

    class MetaSearchRPS(parameters.SandboxIntegerParameter):
        # Total budget: run_subtasks() divides it by the number of workers and
        # passes each subtask its share (never less than 1).
        name = 'metasearch_rps'
        description = 'Maximum RPS to metasearch (makes sense in geojson mode only)'
        default_value = 50
        required = True

    class DebugMode(parameters.SandboxBoolParameter):
        # Adds --debug to the stage-1 command in prepare_data_for_subtasks() and
        # is forwarded unchanged to the requester subtasks.
        name = 'debug_mode'
        description = 'Enable debug logging'
        default_value = False

    # Task type name; SourceTask.task_type above uses the same string.
    type = "BUILD_ADDR_SNIPPET_DATA"
    # Pip packages requested for the task environment
    # (presumably yandex-yt is needed by the YT upload step -- TODO confirm).
    environment = (
        environments.PipEnvironment('yandex-yt', use_wheel=True),
    )
    # Parameters declared in this module followed by the ones inherited from
    # PrepareSaasQuerysearchData.
    input_parameters = (WorkersNum, WorkerTimeout, ItemsProcessLimit, MetaSearchRPS, UseAggregatorsFile,
                        PrepareSaasQuerydata, UpdateSaasQuerydataProd, UpdateSaasQuerydataTesting,
                        DebugMode, SourceTask, LatestLinkName,
                        MakeAddrsnipDataExecutable, AddrsnipNormalizeUrlExecutable) \
                        + PrepareSaasQuerysearchData.input_parameters
    # NOTE(review): unit is presumably MiB (i.e. 25 GiB) -- confirm against the Sandbox SDK.
    required_ram = 25 * 1024

    client_tags = ctc.Tag.Group.LINUX

    # Despite the JSON_ prefix both files are tab-separated text:
    # JSON_PATH_RAW is written by aggregate_from_subtasks() and read by normalize_urls();
    # JSON_PATH_NORM is written by normalize_urls() and parsed by get_data_for_yt().
    JSON_PATH_RAW = "raw.tsv"
    JSON_PATH_NORM = "normalized.tsv"

    # Set by prepare_data() to the absolute path of geodata4.bin and passed as
    # --geodata to the stage-1 command.
    GEODATA_BIN_PATH = None

    def get_src_dir(self):
        return self.ctx['src_dir']

    def run_process(self, *args, **kwargs):
        """Call the SDK-level run_process with stderr appended to <logs>/stderr.log.

        Positional and keyword arguments are forwarded as-is, except that any
        'stderr' keyword supplied by the caller is replaced by the log file.
        Returns whatever the SDK-level run_process returns.
        """
        stderr_log_path = os.path.join(get_logs_folder(), 'stderr.log')
        with open(stderr_log_path, 'a') as errorlog:
            forwarded = dict(kwargs, stderr=errorlog)
            return run_process(*args, **forwarded)

    def initCtx(self):
        self.ctx['kill_timeout'] = 60 * 60 * 24

    def get_aggregators_file(self):
        return self.abs_path(self.get_src_dir() + '/search/web/rearrs_upper/rearrange.dynamic/dumper/request_param_filter.json')

    def prepare_data_for_subtasks(self):
        """Run stage 1 of make_addrsnip_data and register one input resource per worker.

        Steps:
          1. Write addrsnip.sorted.csv: the first line of addrsnip.csv followed by
             the remaining lines de-duplicated and sorted on column 18 onwards.
          2. Run the make_addrsnip_data executable with --stage 1 so that it
             produces WorkersNum output files named ADDRSNIP<i>.pkl.
          3. Register every ADDRSNIP<i>.pkl as a ready OTHER_RESOURCE and store
             the resource ids in ``self.ctx['data_ids']``.
        """
        workers = get_or_default(self.ctx, self.WorkersNum)

        # sort backa data by oid to save queries and merge with url data later;
        # the header line is copied first so it stays on top of the sorted file
        with open('addrsnip.sorted.csv', 'w') as sorted_out:
            with open(os.path.join(get_logs_folder(), 'sort.err'), 'w') as sort_err:
                run_process('head -n1 addrsnip.csv', stdout=sorted_out)
                run_process(
                    '/usr/bin/env bash -c "tail -n +2 addrsnip.csv | LC_ALL=C sort -u | LC_ALL=C sort -t $\'\\t\' -k18"',
                    stdout=sorted_out,
                    stderr=sort_err,
                    shell=True
                )

        executable = self.sync_resource(get_or_default(self.ctx, self.MakeAddrsnipDataExecutable))
        command = [executable]
        command += ['--stage', '1']
        command += ['--outfile', 'ADDRSNIP']
        command += ['--number_files', str(workers)]
        command += ['--geodata', self.GEODATA_BIN_PATH]
        command += ['--backa', 'addrsnip.sorted.csv']
        command += ['--limit', str(get_or_default(self.ctx, self.ItemsProcessLimit))]
        if get_or_default(self.ctx, self.UseAggregatorsFile):
            command += ['--aggregators', self.get_aggregators_file()]
        if get_or_default(self.ctx, self.DebugMode):
            command += ['--debug']

        # Optional dictionaries: used only when a stable-released resource exists.
        sprav_layer = get_last_resource_with_attribute('SPRAV_LAYER_CREATOR_RESOURCE', 'released', 'stable')
        if sprav_layer:
            archive = self.sync_resource(sprav_layer)
            run_process(['tar', 'zxvf', archive])
            command += ['--dict-dir', 'var/cache/sprav']

        print_folder_tree(self, self.abs_path(), recursive=True)
        self.run_process(command)

        self.ctx['data_ids'] = []
        for index in range(workers):
            chunk = self.create_resource(
                "raw data {}".format(index),
                "ADDRSNIP{}.pkl".format(index),
                resource_types.OTHER_RESOURCE
            )
            self.mark_resource_ready(chunk)
            self.ctx['data_ids'].append(chunk.id)

    def run_subtasks(self):
        """Enqueue one BuildAddrSnippetDataRequester subtask per prepared data chunk.

        The chunk resource ids are read from ``self.ctx['data_ids']`` (filled by
        ``prepare_data_for_subtasks``). The MetaSearchRPS budget is split evenly
        between the workers, each getting at least 1. Every subtask receives the
        DebugMode and MakeAddrsnipDataExecutable values of this task and a
        'kill_timeout' of WorkerTimeout hours converted to seconds.
        """
        self.notifications = []  # turn off notifications from subtasks
        workers_num = get_or_default(self.ctx, self.WorkersNum)
        # Explicit floor division: the share stays an integer regardless of the
        # interpreter's "/" semantics (identical to the old result for ints).
        rps_per_worker = max(1, get_or_default(self.ctx, self.MetaSearchRPS) // workers_num)
        common_params = {
            self.MetaSearchRPS.name: rps_per_worker,
            self.DebugMode.name: get_or_default(self.ctx, self.DebugMode),
            self.MakeAddrsnipDataExecutable.name: get_or_default(self.ctx, self.MakeAddrsnipDataExecutable),
            'kill_timeout': int(get_or_default(self.ctx, self.WorkerTimeout)) * 60 * 60,
        }
        for index in range(workers_num):
            # Fresh dict per subtask so the shared parameters are never mutated.
            subtask_params = dict(common_params, input_resource=self.ctx['data_ids'][index])
            BuildAddrSnippetDataRequester(
                sdk2.Task.current,
                description="Requester #{}/{}".format(index + 1, workers_num),
                **subtask_params
            ).enqueue()

    def aggregate_from_subtasks(self):
        """Concatenate the PLAIN_TEXT outputs of the child tasks into JSON_PATH_RAW.

        The children are this task's own subtasks unless the SourceTask
        parameter is set, in which case the children of that task are used.
        The merged file is then registered as a ready OTHER_RESOURCE.
        """
        # .get(): the key may be missing from the context; prepare_data() reads
        # the same parameter this way, and indexing would raise KeyError.
        source_task_id = self.ctx.get(self.SourceTask.name)
        if source_task_id:
            logging.info("Will get child tasks' resources from {}".format(source_task_id))
            subtasks = [task.id for task in channel.sandbox.list_tasks(parent_id=source_task_id)]
        else:
            subtasks = self.list_subtasks(load=False)

        with open(self.JSON_PATH_RAW, 'w') as fout:
            for child in subtasks:
                path = self.sync_resource(get_task_resource_id(child, resource_types.PLAIN_TEXT))
                with open(path) as fin:
                    shutil.copyfileobj(fin, fout)
        self.mark_resource_ready(self.create_resource("raw tsv (full, debug)", self.JSON_PATH_RAW, resource_types.OTHER_RESOURCE))

    def get_export_data(self):
        """Fetch the latest stable-released ADDRESS_SNIPPET_TSV_EXPORT and unpack it.

        The resource is copied into the working directory as addrsnip.csv.gz and
        decompressed with ``gzip -d``.

        Raises:
            SandboxTaskFailureError: if no matching resource is found.
        """
        export = get_last_resource_with_attribute(ADDRESS_SNIPPET_TSV_EXPORT, "released", "stable")
        if not export:
            # Fail with a clear message, as prepare_data() does for the geodata resource.
            raise SandboxTaskFailureError("failed to find released ADDRESS_SNIPPET_TSV_EXPORT resource")
        res = self.sync_resource(export)
        shutil.copy(res, "addrsnip.csv.gz")
        run_process('gzip -d addrsnip.csv.gz')

    def prepare_data(self):
        """Prepare inputs, start the requester subtasks and merge their results.

        Inside ``memoize_stage.stage1`` (presumably executed once across task
        re-runs -- confirm against the SDK): download the backa export, fetch and
        unpack the geodata archive, and, unless a SourceTask is given, split
        the data and enqueue the subtasks. Afterwards check_subtasks_fails() is
        called and the children's outputs are aggregated.

        Raises:
            SandboxTaskFailureError: if no GEODATA4BIN_STABLE resource is found.
        """
        with self.memoize_stage.stage1(commit_on_entrance=False):
            logging.info("Enter prepare data")
            self.get_export_data()

            geobase = get_latest_resource(resource_type="GEODATA4BIN_STABLE")
            if not geobase:
                raise SandboxTaskFailureError("failed to find geodata resource")

            synced_archive = self.sync_resource(geobase.id)
            self.GEODATA_BIN_PATH = self.abs_path('geodata4.bin')
            local_archive = self.GEODATA_BIN_PATH + '.tar.gz'
            shutil.copy(synced_archive, local_archive)
            self.run_process('tar -xzf {0}'.format(local_archive))
            logging.info("geobase synced to {0}".format(self.GEODATA_BIN_PATH))

            # With a source task its children already hold the results,
            # so nothing is split or enqueued here.
            source_task = self.ctx.get(self.SourceTask.name)
            if not source_task:
                self.prepare_data_for_subtasks()
                self.run_subtasks()

        check_subtasks_fails(fail_on_first_failure=True)
        self.aggregate_from_subtasks()

    def normalize_urls(self):
        """Feed JSON_PATH_RAW to the normalize_url executable and write its stdout to JSON_PATH_NORM."""
        executable = self.sync_resource(get_or_default(self.ctx, self.AddrsnipNormalizeUrlExecutable))
        with open(self.JSON_PATH_RAW) as raw_in:
            with open(self.JSON_PATH_NORM, 'w') as normalized_out:
                self.run_process(executable, stdin=raw_in, stdout=normalized_out)

    def get_data_for_yt(self):
        with open(self.JSON_PATH_NORM) as fin:
            for line in fin:
                fields = line.strip().split('\t')
                doc_url, region, lang, json_data = fields
                yield {
                    "Subkey_Url": doc_url,
                    "Subkey_UserRegion": region,
                    "Subkey_SerpUIL": lang,
                    "Data_JSON": json_data
                }

    def on_execute(self):
        """Task entry point: build the snippet data and optionally publish it.

        With the source tree mounted through arc (its path is stored in
        ``self.ctx['src_dir']``): prepare the raw data, normalize the URLs and,
        if 'prepare_saas' is set, upload the rows to YT. After the upload,
        on_release(self, True) is called when 'update_saas_testing' is set and
        on_release(self, False) when 'update_saas' is set.
        """
        with arc.Arc().mount_path(None, None, fetch_all=False) as src_dir:
            self.ctx['src_dir'] = src_dir
            self.prepare_data()
            self.normalize_urls()

            if not get_or_default(self.ctx, PrepareSaasQuerydata):
                return

            rows = self.get_data_for_yt()
            yt_token = self.get_vault_data('GEOMETA-SEARCH', 'yt-token')
            latest_link = self.ctx.get(self.LatestLinkName.name, None)
            PrepareSaasQuerysearchData.upload_to_yt(self, rows, 'addrsnip', yt_token, latest_link)

            # Testing is handled before production, same order as the flags are checked.
            release_steps = (
                (UpdateSaasQuerydataTesting, True),
                (UpdateSaasQuerydataProd, False),
            )
            for flag, release_arg in release_steps:
                if get_or_default(self.ctx, flag):
                    PrepareSaasQuerysearchData.on_release(self, release_arg)


# Module-level alias of the task class (presumably the name the Sandbox task
# loader looks up to discover the task -- TODO confirm).
__Task__ = BuildAddrSnippetData
