# -*- coding: utf-8 -*-

import os
import re
from tempfile import mkdtemp

from sandbox import common
from sandbox.common.fs import WorkDir

from sandbox.projects.resource_types import \
        BROADMATCH_MR_CATALOGIA, BROADMATCH_MR_CATALOGIA_EXTERNAL, \
        BROADMATCH_MR_CATALOGIA_RAW, \
        PLAIN_TEXT

import logging

from sandbox.sandboxsdk.errors import SandboxTaskFailureError
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.parameters import ResourceSelector, SandboxFloatParameter, SandboxBoolParameter
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.paths import list_dir
from sandbox.projects.common.utils import set_resource_attributes
from sandbox.sandboxsdk import channel


class BroadmatchBuildMRCatalogia(SandboxTask):
    """ Task to build BROADMATCH_MR_CATALOGIA from raw data and BROADMATCH_MR_CATALOGIA_EXTERNAL

    The task works in one of two modes, selected by the ``mr_catalogia`` parameter:

    * not set -- the raw and external archives are unpacked, the categorizer is run
      on the test phrases, everything is repacked into a new BROADMATCH_MR_CATALOGIA
      resource and the categorization result is validated;
    * set -- the given catalogia is unpacked and only the categorization test is run.
    """

    # --- Input parameters ---

    class CleanOutDSSM(SandboxBoolParameter):
        # When true, files under dicts/forecast/dssm/ are removed from the unpacked raw catalogia.
        name = "clean_out_dssm"
        description = "Do clean out DSSM models"
        default_value = False
        required = True
        group = "Special"

    class RawCatalogia(ResourceSelector):
        # Optional: when not set, the newest READY resource of this type is located.
        name = "raw_catalogia"
        description = "Raw catalogia from production"
        resource_type = BROADMATCH_MR_CATALOGIA_RAW
        latest = None

    class ExternalCatalogia(ResourceSelector):
        # Optional: when not set, the newest stable "external_pack" resource is located.
        name = "external_catalogia"
        description = "External data for catalogia"
        resource_type = BROADMATCH_MR_CATALOGIA_EXTERNAL

    class MRCatalogia(ResourceSelector):
        # When set, nothing is built: the given catalogia is only tested.
        name = "mr_catalogia"
        description = "Built MR Catalogia (for run test only)"
        resource_type = BROADMATCH_MR_CATALOGIA
        group = "Test only"

    class TestInput(ResourceSelector):
        # Optional: a built-in two-phrase sample is used when not set.
        name = "test_input"
        description = "Phrases to categorize"
        resource_type = PLAIN_TEXT
        group = "Test"

    class TestOutput(ResourceSelector):
        # Optional: the expected categories for the built-in sample are used when not set.
        name = "test_output"
        description = "Expected categorize result"
        resource_type = PLAIN_TEXT
        group = "Test"

    class ExpectedCorrectRate(SandboxFloatParameter):
        # Minimal share of input lines whose categories must match exactly.
        name = "expected_correct_rate"
        description = "Expected correct lines rate"
        default_value = 0.98
        required = True
        group = "Test"

    type = "BROADMATCH_BUILD_MR_CATALOGIA"
    input_parameters = [RawCatalogia, ExternalCatalogia, CleanOutDSSM, TestInput, TestOutput, ExpectedCorrectRate, MRCatalogia]

    # --- Resource lookup ---

    @common.utils.singleton_classproperty
    def sandbox(cls):
        """REST client used to query resources."""
        return common.rest.Client()

    @classmethod
    def locate_by_type(cls, rtype, attrs='{"released": "stable"}'):
        """Return the id of the newest READY resource of type ``rtype``.

        :param rtype: resource type; converted with ``str()`` for the query
        :param attrs: attribute filter passed to the REST API as is
        :raises SandboxTaskFailureError: when no resource matches the query
        """
        logging.info("locating {}".format(str(rtype)))
        resources = cls.sandbox.resource.read({
            "type": str(rtype),
            "limit": 1,
            "order": "-id",
            "attrs": attrs,
            "state": "READY",
        })['items']
        logging.info(resources)
        if not resources:
            # Fail with a readable message instead of a bare IndexError.
            raise SandboxTaskFailureError(
                "No READY resource of type {} found (attrs: {!r})".format(str(rtype), attrs)
            )
        return resources[0]['id']

    # --- Categorization check ---

    @staticmethod
    def _split_categories(line):
        """Split a category line into a sorted list of non-empty category names.

        Only commas directly followed by a non-space character act as separators,
        so a name that itself contains ", " stays in one piece.
        """
        return sorted(cat for cat in re.split(r',(?=\S)', line) if cat != "")

    @staticmethod
    def _compare_categories(got, expected):
        """Merge-compare two sorted category lists.

        :param got: sorted categories produced by the categorizer
        :param expected: sorted categories from the reference output
        :return: list of ``(kind, category)`` pairs in encounter order, where kind is
                 ``"lost"`` (expected but not produced) or ``"extra"`` (produced but
                 not expected); empty when both lists are equal
        """
        mismatches = []
        pos = 0
        for cat in got:
            # Everything expected that sorts before the current category was not produced.
            while pos < len(expected) and expected[pos] < cat:
                mismatches.append(("lost", expected[pos]))
                pos += 1
            if pos < len(expected) and expected[pos] == cat:
                pos += 1
            else:
                mismatches.append(("extra", cat))
        mismatches.extend(("lost", cat) for cat in expected[pos:])
        return mismatches

    def validate_categorization(self, input_path, categories_path, output_path, diff_path):
        """Compare the categorizer result with the expected one and publish the diff.

        The three files are read line by line in lockstep, driven by the input file.
        A line is correct when its produced and expected category sets are equal.
        The per-line differences and the totals are written to ``diff_path``, which is
        published as a PLAIN_TEXT resource.

        :param input_path: phrases that were categorized
        :param categories_path: categorizer output ("got")
        :param output_path: reference output ("expected")
        :param diff_path: where to write the diff report
        :raises SandboxTaskFailureError: when the input is empty or the rate of correct
            lines is below the ``expected_correct_rate`` parameter
        """
        total_lines = 0
        total_results = 0
        total_extra = 0
        total_lost = 0
        total_correct = 0
        with open(input_path) as input_fd, open(categories_path) as categories_fd, \
                open(output_path) as output_fd, open(diff_path, 'w') as diff_fd:
            for in_line in input_fd:
                total_lines += 1
                in_line = in_line.rstrip()

                # readline() returns '' past EOF, so a shorter file reads as "no categories".
                cat_line = categories_fd.readline().rstrip()
                out_line = output_fd.readline().rstrip()

                got = self._split_categories(cat_line)
                expected = self._split_categories(out_line)
                total_results += len(got)

                mismatches = self._compare_categories(got, expected)
                if not mismatches:
                    total_correct += 1
                    continue

                total_extra += sum(1 for kind, _ in mismatches if kind == "extra")
                total_lost += sum(1 for kind, _ in mismatches if kind == "lost")
                msg = "".join("\n\t{} {}".format(kind, cat) for kind, cat in mismatches)
                diff_fd.write("{}: {}  -*expected*-  {}  -*got*-  {}{}\n".format(str(total_lines), in_line, out_line, cat_line, msg))

            diff_fd.write("\n\tTOTAL:\n\tlines: {}\n\tcorrect lines: {}\n\tcategories: {}\n\textra categories: {}\n\tlost categories: {}\n".format(
                total_lines, total_correct, total_results, total_extra, total_lost
            ))

        diff_res = self.create_resource(
            description="Categorization diff",
            resource_path=diff_path,
            resource_type=PLAIN_TEXT,
        )
        self.mark_resource_ready(diff_res.id)

        if total_lines == 0:
            # Nothing was checked; report it explicitly rather than dividing by zero.
            raise SandboxTaskFailureError("No input lines to categorize!")

        expected_rate = self.ctx.get(self.ExpectedCorrectRate.name)
        if expected_rate is None:
            expected_rate = self.ExpectedCorrectRate.default_value
        if float(total_correct) / total_lines < float(expected_rate):
            raise SandboxTaskFailureError("Too many incorrectly categorized lines!")

    # --- Execution helpers ---

    def _fetch_source(self, param, label, attrs):
        """Sync a source resource and return ``(resource_id, local_path)``.

        When the parameter is not set in the context, the newest matching resource is
        located with :meth:`locate_by_type` and stored back into the context.
        """
        resource = self.ctx.get(param.name)
        if not resource:
            resource = self.locate_by_type(param.resource_type, attrs=attrs)
            self.ctx[param.name] = resource
        path = self.sync_resource(resource)
        logging.info("{} {} at {}".format(label, resource, path))
        return resource, path

    def _prepare_test_file(self, param, file_name, default_command):
        """Return the path of a test data file.

        The resource selected by ``param`` is synced when set; otherwise
        ``default_command`` (a shell command with ``{0}`` standing for the target
        path) is run to create ``file_name`` in the task directory.
        """
        path = self.abs_path(file_name)
        resource = self.ctx.get(param.name)
        if resource:
            return self.sync_resource(resource)
        run_process(default_command.format(path), shell=True)
        return path

    def _categorize(self, input_path, categories_path, command_prefix='', **run_kwargs):
        """Run the categorizer in the ``run`` directory and register its output.

        Must be called with the work directory as the current directory.

        :param command_prefix: shell fragment executed before the categorizer
        :param run_kwargs: extra keyword arguments forwarded to ``run_process``
        :return: the created (not yet ready) "Categorize result" resource
        """
        run_process(
            command_prefix + 'cat {} | ./mr_perl ./categorize_phrases.pl > {}'.format(input_path, categories_path),
            shell=True,
            log_prefix='run_categorize',
            work_dir='run',
            **run_kwargs
        )
        return self.create_resource(
            description="Categorize result",
            resource_path=categories_path,
            resource_type=PLAIN_TEXT,
        )

    def _pack_catalogia(self, catalogia_path):
        """Repack ``broadmatching`` and ``external`` into the tar at ``catalogia_path``.

        Must be called with the work directory as the current directory.
        """
        MODERN_TARGET_DIR = '_mr_target_dir'
        LEGACY_TARGET_DIR = 'target_dir'

        bm_files = list_dir('broadmatching')
        if MODERN_TARGET_DIR in bm_files:
            target = MODERN_TARGET_DIR
        elif LEGACY_TARGET_DIR in bm_files:
            target = LEGACY_TARGET_DIR
        else:
            # Neither marker exists: create the modern one with the content "lib".
            run_process('echo lib > {}'.format(MODERN_TARGET_DIR), shell=True, work_dir='broadmatching')
            target = MODERN_TARGET_DIR
        # The target marker goes first in the archive.
        # NOTE(review): presumably the consumer reads it before the rest -- confirm.
        bm_files_str = " ".join([target] + [f for f in bm_files if f != target])

        run_process('tar -czf ../broadmatching.tar.gz {}'.format(bm_files_str), shell=True, work_dir='broadmatching')
        run_process('tar -cf {} broadmatching.tar.gz'.format(catalogia_path), shell=True)
        run_process('tar -rf {} *'.format(catalogia_path), shell=True, work_dir='external')

    def on_execute(self):
        """Build (unless ``mr_catalogia`` is given) and test the MR catalogia."""
        catalogia_res = self.ctx.get(self.MRCatalogia.name)
        need_build = catalogia_res is None

        if need_build:
            logging.info("I will build!")

            # === Get source resources ===
            raw_resource, raw_path = self._fetch_source(self.RawCatalogia, "RawCatalogia", attrs='')
            _, external_path = self._fetch_source(
                self.ExternalCatalogia, "ExternalCatalogia",
                attrs='{"released":"stable", "type":"external_pack"}',
            )
            # /// Get source resources ///

        # === Prepare test data ===
        categories_path = self.abs_path('categories')
        diff_path = self.abs_path('categories_diff')
        input_path = self._prepare_test_file(
            self.TestInput, 'categories_input',
            'echo "bmw" > {0}; echo "купить бмв" >> {0}',
        )
        output_path = self._prepare_test_file(
            self.TestOutput, 'categories_output',
            'echo "Автомобили" > {0}; echo "Покупка от дилера _ Автомобили" >> {0}',
        )
        # /// Prepare test data ///

        # NOTE(review): this temporary directory is never removed by the task.
        work_dir = mkdtemp(prefix='work')

        if need_build:
            catalogia_path = self.abs_path('mr_catalogia.tar')
            with WorkDir(work_dir):
                # Untar raw catalogia
                os.mkdir('broadmatching')
                run_process('tar -xzf {}'.format(raw_path), shell=True, work_dir='broadmatching')

                # Untar patches. The archive is extracted in the work directory itself;
                # presumably it carries its own "external/" prefix -- TODO confirm.
                os.mkdir('external')
                run_process('tar -xzf {}'.format(external_path), shell=True)

                if self.ctx.get(self.CleanOutDSSM.name):
                    run_process('rm dicts/forecast/dssm/*', shell=True, work_dir='broadmatching')

                # run local_catalogia on several phrases
                os.mkdir('run')
                custom_env = os.environ.copy()
                custom_env['PERL5LIB'] = '../broadmatching/scripts/lib'
                categories_res = self._categorize(
                    input_path, categories_path,
                    # Expose the external data in run/: plain files are symlinked, the rest is copied.
                    command_prefix='ls ../external/ | while read f; do if [ -f ../external/$f ]; then ln -s ../external/$f $f; else cp -r ../external/$f $f; fi; done && ',
                    environment=custom_env
                )
                self.mark_resource_ready(categories_res.id)

                # retar everything
                self._pack_catalogia(catalogia_path)

                catalogia_res = self.create_resource(
                    description="MR Catalogia",
                    resource_path=catalogia_path,
                    resource_type=BROADMATCH_MR_CATALOGIA,
                )
                # Carry the "production" attribute over from the raw resource, '0' when absent.
                production = channel.channel.sandbox.get_resource_attribute(raw_resource, 'production')
                if not production:
                    production = '0'
                set_resource_attributes(catalogia_res, {"production": production})
                # The catalogia is marked ready only if validation does not raise.
                self.validate_categorization(input_path, categories_path, output_path, diff_path)
                self.mark_resource_ready(catalogia_res.id)
            # / with WorkDir
        # / if need_build
        else:
            catalogia_path = self.sync_resource(catalogia_res)
            with WorkDir(work_dir):
                os.mkdir('run')
                run_process('tar -xf {}'.format(catalogia_path), shell=True, work_dir='run')
                categories_res = self._categorize(input_path, categories_path)
                # NOTE(review): unlike the build branch, the categories resource is marked
                # ready only after a successful validation -- confirm this is intended.
                self.validate_categorization(input_path, categories_path, output_path, diff_path)
                self.mark_resource_ready(categories_res.id)
            # / with WorkDir

# Module-level alias for the task class defined above.
# NOTE(review): presumably the name the Sandbox task loader looks up -- confirm.
__Task__ = BroadmatchBuildMRCatalogia
