# -*- coding: utf-8 -*-

import json
import logging
import pathlib2
import os
import shutil
import tarfile
from concurrent import futures

from sandbox.sdk2.helpers import subprocess as sp
from sandbox.sdk2.helpers import ProgressMeter
from sandbox.projects import resource_types
from sandbox.common.errors import TaskFailure

from sandbox import sdk2

import sandbox.common.types.client as ctc

from sandbox.projects.news import resources

from sandbox.projects.news.BuildNewsAnnotatorPackage import NEWS_ANNOTATOR_PACKAGE
from sandbox.projects.news.BuildNewsStandaloneAnnotator import NEWS_STANDALONE_ANNOTATOR_EXECUTABLE
from sandbox.projects.news.BuildNewsAnnotatorConfigBundle import NEWS_ANNOTATOR_CONFIG
from sandbox.projects.news.BuildNewsSkybitConfigBundle import NEWS_2LD_LIST


def recreate_directory_struct(src, dst):
    """Mirror the directory tree of ``src`` under ``dst``, symlinking every file.

    Directories are created for real under ``dst``; each file found while
    walking ``src`` (symlinked directories are followed) is represented by a
    symlink that points at the absolute path of the original file.

    :param src: directory whose structure is replicated.
    :param dst: directory under which the structure is recreated.
    :raises OSError: if a destination directory cannot be created (and does
        not already exist as a directory) or a symlink cannot be created.
    """
    src = os.path.abspath(src)
    for (dirpath, dirnames, filenames) in os.walk(src, followlinks=True):
        dst_path = os.path.join(dst, os.path.relpath(dirpath, src))
        try:
            os.makedirs(dst_path)
        except OSError:
            # An already existing directory is expected (e.g. several archives
            # unpacked into one target). Any other failure - permissions, a
            # regular file in the way - must not be hidden.
            if not os.path.isdir(dst_path):
                raise
        for name in filenames:
            os.symlink(os.path.join(dirpath, name), os.path.join(dst_path, name))


def untar_all(archive_path, target):
    """Place the contents of ``archive_path`` into ``target``.

    A tar archive (any compression that ``tarfile`` auto-detects) is extracted
    in full. If ``archive_path`` is a directory instead, its structure is
    mirrored into ``target`` via ``recreate_directory_struct``.

    :param archive_path: path to a tar archive or to a directory.
    :param target: destination directory.
    """
    if not os.path.isdir(archive_path):
        with tarfile.open(archive_path, 'r:*') as archive:
            archive.extractall(target)
        return

    # Already a directory: nothing to extract, just mirror it.
    recreate_directory_struct(archive_path, target)


def get_resource_path_sync(resource, name=None):
    """Return the path of ``resource`` (via ``sdk2.ResourceData``) as a string.

    :param resource: resource object accepted by ``sdk2.ResourceData``.
    :param name: optional label; when given, a debug message is logged first.
    :return: ``str`` of the resource data path.
    """
    if name:
        logging.debug("Getting %s", name)
    return str(sdk2.ResourceData(resource).path)


def get_resource_and_unpack_sync(resource, target, name=None, create_target=False):
    """Resolve a resource to a local path and unpack it into ``target``.

    :param resource: resource object passed on to ``get_resource_path_sync``.
    :param target: directory that receives the unpacked contents.
    :param name: optional label used for debug logging.
    :param create_target: when true, ``target`` (with parents) is created
        before unpacking; an existing directory is accepted.
    """
    resource_path = get_resource_path_sync(resource, name)
    if create_target:
        target_dir = pathlib2.Path(target)
        target_dir.mkdir(parents=True, exist_ok=True)
    untar_all(resource_path, target)


class GetNewsAnnotatorResponses(sdk2.Task):
    '''
        Obtains and saves the results of an annotator run
    '''

    # Task inputs: annotator config (optionally patched with jq), the annotator
    # binary, the input dump of raw news docs, run-mode flags and data resources.
    class Parameters(sdk2.Task.Parameters):
        with sdk2.parameters.Group("config parameters") as config_block:
            annotator_config = sdk2.parameters.Resource('annotator config', resource_type=NEWS_ANNOTATOR_CONFIG, required=True)
            config_patch = sdk2.parameters.Resource('config patch', required=False)
            patch_executable = sdk2.parameters.Resource('patch executable (jq)', resource_type=resource_types.JQ_EXECUTABLE, required=False)
        annotator_executable = sdk2.parameters.Resource('annotator executable', resource_type=NEWS_STANDALONE_ANNOTATOR_EXECUTABLE, required=True)
        news_dump = sdk2.parameters.Resource('raw news docs dump', resource_type=resources.NEWS_RAW_NEWS_DOC_DUMP, required=True)
        strict_mode = sdk2.parameters.Bool('fail if got error', default=True)
        disable_entity_cache = sdk2.parameters.Bool('disable entity cache', default=False)
        test_mode = sdk2.parameters.Bool('disable some optimizations that interfere with result reproducibility', default=True)
        with sdk2.parameters.Group("data") as data_block:
            annotator_package = sdk2.parameters.Resource('annotator package', resource_type=NEWS_ANNOTATOR_PACKAGE, required=True)
            regexp_patches = sdk2.parameters.Resource('regexp patches', resource_type=resources.NEWS_REGEXP_PATCHES, required=False)
            news_2ld_list = sdk2.parameters.Resource('2ld list', resource_type=NEWS_2LD_LIST, required=True)
            news_backoffice_data = sdk2.parameters.Resource('backoffice data', resource_type=resource_types.NEWS_BACKOFFICE_DATA, required=True)
            yane_data = sdk2.parameters.Resource('Data for dict/nerlib', resource_type=resource_types.YANE_DATA, required=False)
        out_responses_parent_resource = sdk2.parameters.ParentResource('Annotator responses', resource_type=resources.NEWS_NEWS_DOC_INFO_DUMP, required=False, do_not_copy=True)
        with sdk2.parameters.Group("debug parameters") as debug_block:
            use_strace = sdk2.parameters.Bool('run under strace', default=False)

    # Host requirements for running the annotator.
    class Requirements(sdk2.Task.Requirements):
        client_tags = ctc.Tag.Group.LINUX
        disk_space = 75 * 1024
        ram = 65 * 1024
        cores = 8

        class Caches(sdk2.Requirements.Caches):
            pass

    def on_enqueue(self):
        """Remember the id of the output resource in the task context.

        Uses the parent-provided resource when one is passed; otherwise
        creates a new NEWS_NEWS_DOC_INFO_DUMP resource owned by this task.
        """
        if self.Parameters.out_responses_parent_resource:
            self.Context.out_resource_id = self.Parameters.out_responses_parent_resource.id
        else:
            self.Context.out_resource_id = resources.NEWS_NEWS_DOC_INFO_DUMP(self, "annotator responses", "info.yson").id

    def on_execute(self):
        """Prepare the working directory, run the annotator and publish its output.

        Steps: recreate the work dir, optionally patch the annotator config,
        generate the resources config, fetch and unpack data resources in
        parallel, then run the annotator with the news dump on stdin and the
        output resource file on stdout.

        :raises TaskFailure: if a config patch is given without a patch
            executable, or if the annotator process fails (the output
            resource is marked broken in that case).
        """
        out_resource = sdk2.ResourceData(sdk2.Resource[self.Context.out_resource_id])

        workdir = str(self.path('work'))
        shutil.rmtree(workdir, ignore_errors=True)  # clear on restart
        datadir = os.path.join(workdir, 'data')
        pathlib2.Path(datadir).mkdir(parents=True, exist_ok=True)
        dynamicdir = os.path.join(workdir, 'dynamic_data')
        pathlib2.Path(dynamicdir).mkdir(parents=True, exist_ok=True)

        annotator_config_path = str(sdk2.ResourceData(self.Parameters.annotator_config).path)

        if self.Parameters.config_patch:
            # Both parameters are optional on their own, but the patch cannot be
            # applied without the executable: fail with a clear message.
            if not self.Parameters.patch_executable:
                raise TaskFailure("config patch is set, but patch executable (jq) is not provided")
            config_patch_path = str(sdk2.ResourceData(self.Parameters.config_patch).path)
            patch_executable_path = str(sdk2.ResourceData(self.Parameters.patch_executable).path)
            new_config_path = os.path.join(workdir, 'patched_annotator.json')
            with sdk2.helpers.ProcessLog(self, logger="patch") as pl:
                with open(new_config_path, 'w') as f:
                    sp.check_call([patch_executable_path, '--from-file', config_patch_path, annotator_config_path], stdout=f, stderr=pl.stderr, cwd=workdir)
            shutil.copy2(new_config_path, str(self.log_path('patched_annotator.json')))  # log patch
            annotator_config_path = new_config_path

        if self.Parameters.regexp_patches:
            text_patches_path = str(sdk2.ResourceData(self.Parameters.regexp_patches).path)
        else:
            logging.info("Using empty patches file")
            text_patches_path = os.path.join(workdir, "empty_array.json")
            with open(text_patches_path, 'w') as f:
                f.write("[]\n")
        annotator_resources_config_path = os.path.join(workdir, self._get_resources_config_name(annotator_config_path))
        logging.debug("Generating resources config %s", annotator_resources_config_path)
        self._generate_resources_config(annotator_resources_config_path, text_patches_path)
        shutil.copy2(annotator_resources_config_path, str(self.log_path('resources_config.json')))  # log generated config

        news_2ld_list_path = str(sdk2.ResourceData(self.Parameters.news_2ld_list).path)
        os.symlink(news_2ld_list_path, os.path.join(workdir, '2ld.list'))

        logging.info("Extracting data")
        with ProgressMeter("Extracting data"), futures.ThreadPoolExecutor(max_workers=8) as executor:
            # Keep the futures of the unpack jobs: an exception raised inside a
            # worker is only surfaced when Future.result() is called.
            unpack_futures = []
            if self.Parameters.yane_data:
                unpack_futures.append(
                    executor.submit(get_resource_and_unpack_sync, self.Parameters.yane_data, os.path.join(workdir, 'ner_data'), 'YANE_DATA', create_target=True)
                )
            unpack_futures.append(
                executor.submit(get_resource_and_unpack_sync, self.Parameters.news_backoffice_data, dynamicdir, 'NEWS_BACKOFFICE_DATA')
            )
            unpack_futures.append(
                executor.submit(get_resource_and_unpack_sync, self.Parameters.annotator_package, datadir, 'NEWS_ANNOTATOR_PACKAGE')
            )
            annotator_executable_path_future = executor.submit(get_resource_path_sync, self.Parameters.annotator_executable)
            news_dump_path_future = executor.submit(get_resource_path_sync, self.Parameters.news_dump)

            annotator_executable_path = annotator_executable_path_future.result()
            news_dump_path = news_dump_path_future.result()
            for unpack_future in unpack_futures:
                # Re-raises any unpack error instead of running on partial data.
                unpack_future.result()

        cmd_line = [annotator_executable_path, annotator_config_path, '--streaming', 'doc', '--streaming-news']
        if self.Parameters.strict_mode:
            cmd_line.append('--strict-streaming')
        if self.Parameters.disable_entity_cache:
            cmd_line.append('--disable-entity-cache')
        if self.Parameters.test_mode:
            cmd_line.append('--test-mode')

        if self.Parameters.use_strace:
            cmd_line = ['strace', '-f', '-e', 'trace=open,openat,close'] + cmd_line

        env = os.environ.copy()
        # NOTE(review): MKL_CBWR looks like the Intel MKL reproducibility switch,
        # presumably set to keep results stable across hosts - confirm.
        env["MKL_CBWR"] = "COMPATIBLE"
        with open(news_dump_path, 'rb') as news_input, open(str(out_resource.path), 'wb') as info_output:
            with sdk2.helpers.ProcessLog(self, logger="annotator") as pl:
                try:
                    sp.check_call(cmd_line, stdout=info_output, stdin=news_input, stderr=pl.stderr, cwd=workdir, env=env)
                except Exception as e:
                    out_resource.broken()  # explicitly broke parent's resource
                    raise TaskFailure(str(e))

        out_resource.ready()

    @staticmethod
    def _get_resources_config_name(annotator_config_path):
        """Return the "ResourcesConfig" value of the annotator JSON config.

        :param annotator_config_path: path to the annotator config (JSON).
        :return: the configured name, or "resources.json" when the key is absent.
        """
        with open(annotator_config_path, 'r') as config_file:
            cfg = json.load(config_file)
            return cfg.get("ResourcesConfig", "resources.json")

    @staticmethod
    def _generate_resources_config(target, text_patches_path):
        """Write a resources config with a single "text-patches" file resource.

        :param target: path of the JSON file to write.
        :param text_patches_path: path of the patches file, referenced as a
            ``file://`` URI.
        """
        config = {
            "cache_dir": "resources_cache",
            "resources": [
                {
                    "name": "text-patches",
                    "uri": "file://{}".format(text_patches_path),
                }
            ]
        }
        with open(target, 'w') as f:
            json.dump(config, f, indent=4, separators=(',', ': '), sort_keys=True)
