# -*- coding: utf-8 -*-
import datetime
import os
import re
import pathlib2
import platform
import random
import logging

from sandbox.sandboxsdk import environments
from sandbox import sdk2
from sandbox.sdk2.helpers import subprocess as sp
from sandbox.common.errors import TaskFailure
from sandbox.common.platform import get_arch_from_platform
from sandbox.common.types import (
    misc as ctm,
    client as ctc,
    resource as ctr,
)

from sandbox.projects.UrlsByShowCounters.resource_types import (
    URLS_BY_SHOW_COUNTERS_EXECUTABLE,
    URLS_BY_SHOW_COUNTERS_DATA,
    URLS_BY_SHOW_COUNTERS_DEBUG_DATA,
    ABSOLUTE_URLS_BY_SHOW_COUNTERS_DATA,
    QUERIES_BY_SHOW_COUNTERS,
)

from sandbox.projects.sandbox_ci.sandbox_ci_web4_generate_baobab_counters.resources import (
    SANDBOX_CI_WEB4_BAOBAB_COUNTERS
)

# Base YT path of the input logs; the task appends '1d/<date>' to it to get the input table
INPUT_TABLE_BASE_PATH = pathlib2.Path('//logs/search-web-blockstat-log')
# Base YT path under which the task places its output table folder
OUTPUT_TABLE_BASE_PATH = pathlib2.Path('//home/yframe')

# YT proxy address: used to create the YT client and exported to subprocesses as YT_PROXY
YT_PROXY = 'hahn.yt.yandex.net'
# YT cluster name substituted into the operation URL that is added to the task description
YT_CLUSTER = 'hahn'


class UrlsByShowCounters(sdk2.Task):
    """
    Sandbox task that drives the URLS_BY_SHOW_COUNTERS_EXECUTABLE binary.

    The task works in two phases spread over repeated ``on_execute`` calls:

    1. the executable is run in ``run-map-reduce`` mode, the ID of the YT operation
       it reports is stored in the task context and the task is put to sleep;
    2. every wake-up polls the state of that operation; once it has finished
       successfully the executable is run in ``prepare-result`` mode and its
       output files are published as Sandbox resources.
    """

    class Requirements(sdk2.Task.Requirements):
        dns = ctm.DnsType.DNS64
        disk_space = 1024
        client_tags = ctc.Tag.LINUX_PRECISE
        # ``yt.wrapper`` is imported lazily in ``on_execute``
        environments = (
            environments.PipEnvironment('yandex-yt'),
        )

    class Parameters(sdk2.Task.Parameters):
        kill_timeout = 60 * 60 * 2  # two hours

        token_owner = sdk2.parameters.String(
            'Vault owner to extract token',
            required=True,
            default='SANDBOX_CI_SEARCH_INTERFACES',
        )

        token_name = sdk2.parameters.String(
            'Vault name to extract token',
            required=True,
            default='robot-drunken-flash-yt',
        )

        output_table_folder = sdk2.parameters.String(
            'YT output table folder',
            required=True,
            default='FEI-7133-features-by-counters',
        )

        # sdk2.parameters.Integer does not have choices
        max_urls = sdk2.parameters.String(
            'Max urls per counter and platform',
            required=True,
            default='10',
            choices=[
                ('1', '1'),
                ('10', '10'),
                ('100', '100'),
            ],
        )

        # sdk2.parameters.Integer does not have choices
        max_urls_per_tld = sdk2.parameters.String(
            'Max urls per tld for each counter',
            required=True,
            default='10',
            choices=[
                ('1', '1'),
                ('10', '10'),
                ('100', '100'),
            ],
        )

        need_tlds_coverage = sdk2.parameters.Bool(
            "TLD's coverage",
            description='Пытаться покрыть не только RU-домен, но и прочие TLD',
            required=True,
            default=False,
        )

        data_format = sdk2.parameters.String(
            'Data format to save resources',
            required=True,
            default='json',
            choices=[
                ('json', 'json'),
                ('leveldb', 'leveldb'),
            ],
        )

        wait_time = sdk2.parameters.Integer(
            'Time in seconds that the task sleeps between checks of status for YT MapReduce operation',
            default=30 * 60,
        )

        with sdk2.parameters.Group('Debug') as debug_block:
            debug_mode = sdk2.parameters.Bool('Debug mode', default=False)

            with debug_mode.value[True]:
                number_of_rows = sdk2.parameters.Integer(
                    'Number of rows to process (debug mode only), set 0 for all rows',
                    default=10000000,
                )

        executable_resource = sdk2.parameters.Resource(
            'Task executable, will use last released if not specified',
            resource_type=URLS_BY_SHOW_COUNTERS_EXECUTABLE,
        )

    class Context(sdk2.Task.Context):
        # ID of the started YT operation; empty until ``run_map_reduce`` has succeeded
        yt_operation_id = None
        # YT path of the output table, stored by ``run_map_reduce`` for ``prepare_result``
        output_table_path = None

    def on_execute(self):
        """
        Start the YT MapReduce operation on the first run and poll its state on later runs.

        :raises sdk2.WaitTime: to suspend the task until the next state check
        :raises TaskFailure: if the YT operation finished unsuccessfully
        """
        data_format = self.Parameters.data_format
        max_urls = int(self.Parameters.max_urls)
        max_urls_per_tld = int(self.Parameters.max_urls_per_tld)
        need_tlds_coverage = self.Parameters.need_tlds_coverage
        yt_token = sdk2.Vault.data(self.Parameters.token_owner, self.Parameters.token_name)

        # The first on_execute run must start the YT MapReduce operation, store its ID
        # and then put the sandbox task into the waiting state.
        # See FEI-11338 for details.
        if not self.Context.yt_operation_id:
            self.run_map_reduce(data_format, max_urls, max_urls_per_tld, need_tlds_coverage, yt_token)
            raise sdk2.WaitTime(self.Parameters.wait_time)

        # Installing YT into the environment takes about a second, so there is no point
        # in moving this check into a separate binary (./executable/bin/main.py)
        import yt.wrapper as yt
        yt_client = yt.YtClient(YT_PROXY, yt_token)
        state = yt.get_operation_state(self.Context.yt_operation_id, client=yt_client)

        # is_finished does not guarantee successful completion,
        # so is_unsuccessfully_finished is checked first
        if state.is_unsuccessfully_finished():
            raise TaskFailure('YT operation MapReduce unsuccessfully finished')
        elif state.is_finished():
            self.prepare_result(data_format, max_urls, need_tlds_coverage, yt_token)
        else:
            raise sdk2.WaitTime(self.Parameters.wait_time)

    def get_exec_path(self):
        """
        Return the local path of the task executable.

        Uses the ``executable_resource`` parameter when it is set; otherwise looks up
        the newest READY resource released to stable for the current architecture.

        :rtype: str
        :raises TaskFailure: if no suitable executable resource was found
        """
        current_arch = get_arch_from_platform(platform.platform())
        executable_resource = self.Parameters.executable_resource
        if executable_resource is None:
            # Explicit ordering by descending ID so that "last released" does not depend
            # on the default sort order of the query
            executable_resource = URLS_BY_SHOW_COUNTERS_EXECUTABLE.find(
                state=ctr.State.READY,
                arch=current_arch,
                attrs=dict(
                    released='stable',
                ),
            ).order(-sdk2.Resource.id).first()

        if executable_resource is None:
            raise TaskFailure('Could not find suitable URLS_BY_SHOW_COUNTERS_EXECUTABLE resource')

        return str(sdk2.ResourceData(executable_resource).path)

    def get_baobab_counters_path(self):
        """
        Return the local path of the newest READY non-debug SANDBOX_CI_WEB4_BAOBAB_COUNTERS resource.

        :rtype: str | None
        :return: path to the resource data, or None when no such resource exists
        """
        baobab_counters = SANDBOX_CI_WEB4_BAOBAB_COUNTERS.find(
            state=ctr.State.READY,
            attrs=dict(
                debug=False,
            ),
        ).order(-sdk2.Resource.id).first()

        if baobab_counters is None:
            return None

        return str(sdk2.ResourceData(baobab_counters).path)

    def run_map_reduce(self, data_format, max_urls, max_urls_per_tld, need_tlds_coverage, yt_token):
        """
        Run the executable in ``run-map-reduce`` mode and remember the started YT operation.

        The input table is yesterday's table under ``INPUT_TABLE_BASE_PATH / '1d'``.
        On success the operation ID and the output table path are stored in the task
        context and a link to the operation is appended to the task description.

        :type data_format: str
        :type max_urls: int
        :type max_urls_per_tld: int
        :type need_tlds_coverage: bool
        :type yt_token: str
        :raises TaskFailure: if the operation ID is not found in the executable output
        """

        debug_mode = self.Parameters.debug_mode
        today = datetime.date.today()
        yesterday = today - datetime.timedelta(days=1)
        output_table_folder = self.Parameters.output_table_folder

        input_table_path = INPUT_TABLE_BASE_PATH / '1d' / str(yesterday)
        output_table_name = "{}_{}_{}".format(today, max_urls, data_format)

        if debug_mode:
            # FEI-7492: add a suffix to the table path in debug mode
            output_table_folder = '{}-debug'.format(output_table_folder)
            # Reduce table name collisions while debugging
            output_table_name = '{}_{}'.format(
                datetime.datetime.now().strftime('%s'), random.randint(0, 999999))

            # FEI-7492: limit the number of processed rows in debug mode
            if self.Parameters.number_of_rows > 0:
                input_table_path = '{}[:#{}]'.format(input_table_path, self.Parameters.number_of_rows)

        output_table_path = OUTPUT_TABLE_BASE_PATH / output_table_folder / output_table_name

        input_table_path = str(input_table_path)
        output_table_path = str(output_table_path)
        with sdk2.helpers.ProcessLog(self, logger="urls_by_show_counters_mapreduce"):
            proc_args = [
                self.get_exec_path(),
                'run-map-reduce',
                '--input-table', input_table_path,
                '--output-table', output_table_path,
                '--max-urls', str(max_urls),
                '--max-urls-per-tld', str(max_urls_per_tld),
            ]

            if debug_mode:
                proc_args.append('--debug')

            if need_tlds_coverage:
                proc_args.append('--tlds-coverage')

            baobab_counters_path = self.get_baobab_counters_path()

            if baobab_counters_path is not None:
                proc_args.append('--baobab-counters-path')
                proc_args.append(baobab_counters_path)

            proc = sp.Popen(
                proc_args,
                env=get_env(yt_token),
                stdout=sp.PIPE,
                stderr=sp.STDOUT,
            )
            # stderr is merged into stdout above, so the second item returned by
            # communicate() is always None and is not worth logging
            output, _ = proc.communicate()

            logging.info(
                'urls_by_show_counters exit code: %s, output (stdout + stderr):\n%s',
                proc.returncode, output,
            )
            # The executable reports the operation as "yt_operation_for_urls_by_show_counters <id>"
            search_result = re.search(r'yt_operation_for_urls_by_show_counters[\s]*([^\s]+)', output)
            if not search_result:
                # Include the exit code so that a crashed executable is distinguishable
                # from one that merely did not print the operation ID
                raise TaskFailure(
                    'Cannot get YT operation id from log urls_by_show_counters (exit code: {}).'.format(
                        proc.returncode
                    )
                )

            self.Context.yt_operation_id = search_result.group(1)
            self.Context.output_table_path = output_table_path
            self.update_description()

    def prepare_result(self, data_format, max_urls, tlds_coverage, yt_token):
        """
        Run the executable in ``prepare-result`` mode and publish its output files as resources.

        Three files are produced (absolute URLs, relative URLs, queries); each of them
        becomes a separate resource that is marked as ready.

        :type data_format: str
        :type max_urls: int
        :type tlds_coverage: bool
        :type yt_token: str
        """
        output_table_path = self.Context.output_table_path
        output_queries_path = './result_queries.{}'.format(data_format)
        output_absolute_path = './result_absolute.{}'.format(data_format)
        output_relative_path = './result.{}'.format(data_format)

        with sdk2.helpers.ProcessLog(self, logger="urls_by_show_counters_prepare_result") as pl:
            sp.check_call(
                [
                    self.get_exec_path(),
                    'prepare-result',
                    '--output-table', output_table_path,
                    '--data-format', data_format,
                    '--output-absolute', output_absolute_path,
                    '--output-relative', output_relative_path,
                    '--output-queries', output_queries_path,
                ],
                env=get_env(yt_token),
                stdout=pl.stdout,
                stderr=sp.STDOUT,
            )

        save_resources = [
            (output_absolute_path, ABSOLUTE_URLS_BY_SHOW_COUNTERS_DATA),
            (output_relative_path, URLS_BY_SHOW_COUNTERS_DATA),
            (output_queries_path, QUERIES_BY_SHOW_COUNTERS),
        ]
        for (resource_path, resource_type) in save_resources:
            # FEI-7492: use a dedicated resource type in debug mode
            if self.Parameters.debug_mode:
                resource_type = URLS_BY_SHOW_COUNTERS_DEBUG_DATA

            resource = sdk2.ResourceData(resource_type(
                self, self.Parameters.description, resource_path,
                format=data_format,
                data_format=data_format,
                max_urls=max_urls,
                tlds_coverage=tlds_coverage,
            ))

            resource.ready()

    def update_description(self):
        """Append an HTML link to the started YT operation to the task description."""
        operation_url = 'https://yt.yandex-team.ru/{YT_CLUSTER}/operations/{operation_id}/'.format(
            operation_id=self.Context.yt_operation_id,
            YT_CLUSTER=YT_CLUSTER
        )
        operation_link = 'YT Operation: <a href="{url}">{operation_id}</a>'.format(
            url=operation_url,
            operation_id=self.Context.yt_operation_id
        )
        self.Parameters.description = self.Parameters.description + '\n\n' + operation_link


def get_env(yt_token):
    """
    Build the environment for a child process of the task.

    :type yt_token: str
    :rtype: dict
    :return: a copy of the current process environment extended with YT_TOKEN and YT_PROXY
    """
    child_env = dict(os.environ)
    child_env.update(
        YT_TOKEN=yt_token,
        YT_PROXY=YT_PROXY,
    )
    return child_env
