import logging
import re
import sys
import time

from datetime import datetime
from enum import Enum
from os import environ, mkdir, path

import sandbox.common.types.misc as ctm

from sandbox import sdk2
from sandbox.common import errors
from sandbox.common.types import notification
from sandbox.sandboxsdk import environments

from sandbox.projects.maps.common.ecstatic_bin import MapsEcstaticToolMixin
from sandbox.projects.maps.common.juggler_alerts import TaskJugglerReportWithParameters

from parameters import (
    MapsStreetviewPoiPipelineParams,
)
from toloka_utils import (
    check_toloka_pool_finished,
    check_have_money_for_task,
    open_toloka_pool,
)
from utils import (
    filter_golden_set_table,
    run_binary_cmd,
    sample_golden_set,
    separate_golden_set,
    YtPathContext,
)

# Configure the root logger once at import time: records at INFO and above
# are written to stderr, prefixed with a timestamp and the level name.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s : %(message)s',
)
# Root logger instance used by every method of the task below.
logger = logging.getLogger()

# Value of the toloka tool `--mds-bucket` option for the main markup pools
# (also used for golden set pools when `public_url_format` is enabled).
PUBLIC_MDS_BUCKET = 'maps-core-streetview-poi-toloka-1'
# Default `--mds-bucket` value for golden set pools.
PRIVATE_MDS_BUCKET = 'maps-core-streetview-poi-toloka-sandbox'
# Ecstatic dataset name; also reused as the local directory the dataset
# is downloaded to and that is passed to the preview creator binary.
PANO_DESCRIPTION_DATASET = 'yandex-maps-streetview-description'
# First positional argument of `self.ecstatic(...)` calls.
ECSTATIC_ENV = 'stable'
# Branch argument of the ecstatic `versions` command.
ECSTATIC_BRANCH = 'stable'
# Captures the version part of `<dataset>=<version>` entries in the
# ecstatic `versions` output.
ECSTATIC_ACTIVE_VERSIONS_REGEX = r'{}=(\S*).*'.format(PANO_DESCRIPTION_DATASET)
# Captures the pool URL printed by the toloka tool after pool creation.
# NOTE(review): the "avaliable" spelling presumably mirrors the tool output
# verbatim - do not "fix" it here without changing the tool as well.
TOLOKA_TOOL_POOL_LINK_REGEX = r'pool avaliable at.*(https://.*/pool/\d+).*'
# Preview size; kept as strings because they are passed as CLI arguments.
PANO_WIDTH_PIX = '1000'
PANO_HEIGHT_PIX = '1000'
# Used to convert `max_panoramas_age_days` into a unix timestamp offset.
SECONDS_PER_DAY = 86400


class NoNewDataError(Exception):
    """Signals that panorama preparation found nothing new to process.

    Raised by the panorama preparation step when the pano id generation
    binary leaves no output table behind; the task entry point treats it
    as a normal (non-failing) early finish.
    """


class TolokaStage(Enum):
    """The three consecutive Toloka markup stages of the pipeline.

    The member value is used to build the `<value>-task` subcommand of the
    toloka tool, the `<value>_task` YT directory name and memoize stage keys.
    """
    FIRST = 'first'
    SECOND = 'second'
    THIRD = 'third'


class MapsStreetviewPoiTolokaPipeline(TaskJugglerReportWithParameters, MapsEcstaticToolMixin):
    # Pip environments requested for the task. The `yt` client is imported
    # lazily inside the methods that use it rather than at module level.
    class Requirements(sdk2.Task.Requirements):
        environments = [
            environments.PipEnvironment('yandex-yt'),
            environments.PipEnvironment('yql'),
        ]

    # All task parameters are declared in `parameters.py`
    # (MapsStreetviewPoiPipelineParams); nothing is added or overridden here.
    class Parameters(MapsStreetviewPoiPipelineParams):
        pass

    def _setup_environment(self):
        """Prepare per-run state, environment variables and credential files.

        Fixes the run suffix in the task context on the first execution
        (``force_suffix`` wins over the current timestamp), derives the
        project YT directory from it, exports YT/YQL tokens and writes the
        AWS credentials and the YT token into the home directory.
        """
        logger.info('setting up environment')

        if self.Context.datetime_suffix is ctm.NotExists:
            forced_suffix = self.Parameters.force_suffix
            if forced_suffix:
                self.Context.datetime_suffix = forced_suffix
            else:
                self.Context.datetime_suffix = datetime.now().strftime('%Y-%m-%d_%H%M%S')

        yt_root = YtPathContext(self.Parameters.yt_root_path)
        self._project_yt_dir = yt_root.subpath(self.Context.datetime_suffix)

        environ['YT_PROXY'] = 'hahn'
        environ['YT_TOKEN'] = self.Parameters.yt_token.data()
        environ['YQL_TOKEN'] = self.Parameters.yql_token.data()

        # make sure the directories for the credential files exist
        for token_dir in ('~/.aws', '~/.yt'):
            expanded_dir = path.expanduser(token_dir)
            if not path.exists(expanded_dir):
                mkdir(expanded_dir)

        aws_credentials = (
            '[default]\n'
            'aws_access_key_id = {}\n'
            'aws_secret_access_key = {}'
        ).format(
            self.Parameters.aws_access_key_id.data(),
            self.Parameters.aws_secret_access_key.data(),
        )
        with open(path.expanduser('~/.aws/credentials'), 'w') as credentials_file:
            credentials_file.write(aws_credentials)

        # some underlying tasks need token file instead of env
        with open(path.expanduser('~/.yt/token'), 'w') as token_file:
            token_file.write(self.Parameters.yt_token.data())

    def _send_notification(self, stage, msg):
        """Send an email about a pipeline event to the configured recipients.

        The body contains a link to this Sandbox task, the stage label and
        the message itself.

        :param stage: label of the pipeline stage the message relates to
        :param msg: free-form message text
        """
        task_url = 'https://sandbox.yandex-team.ru/task/{}/view'.format(self.id)
        # the label used to read "Sandox task url" - fixed the misspelling
        body = (
            'Sandbox task url: {}\n'
            'Toloka stage: {}\n'
            'Message: {}'
        ).format(task_url, stage, msg)
        self.server.notification(
            subject="MapsStreetviewPoiTolokaPipeline notification",
            body=body,
            recipients=self.Parameters.notification_recipients,
            transport=notification.Transport.EMAIL,
        )

    def _prepare_panoramas(self):
        """Create the project layout in YT and render panorama previews.

        Steps, in order:
          1. create the project directory tree;
          2. copy the markup permalinks table into the project;
          3. run the ``bld_orgs`` and pano id generation binaries;
          4. pick the lexicographically greatest dataset version listed by
             ecstatic and download it;
          5. run the preview creator binary into ``panoramas/panoramas``.

        :raises NoNewDataError: when the pano id generation binary left no
            output table (an empty one is created in its place first).
        :raises errors.TaskError: when no dataset version could be parsed
            from the ecstatic ``versions`` output.
        """
        import yt.wrapper as yt
        yt.config['proxy']['url'] = 'hahn.yt.yandex.net'

        # init project layout
        for yt_path in ['', 'panoramas', 'first_task', 'second_task', 'third_task']:
            yt.create('map_node', path=str(self._project_yt_dir.subpath(yt_path)))

        bld_orgs_table = str(self._project_yt_dir.subpath('panoramas/bld_orgs'))
        bld_pano_ids_table = str(self._project_yt_dir.subpath('panoramas/bld_pano_ids'))
        markup_permalinks = str(self._project_yt_dir.subpath('panoramas/markup_permalinks'))

        yt.copy(self.Parameters.markup_permalinks, markup_permalinks)

        run_binary_cmd(
            self.Parameters.bld_orgs_binary,
            cmd=[
                '--ymapsdf-folder', self.Parameters.ymapsdf_folder,
                '--output-table', bld_orgs_table,
                '--input-permalinks', markup_permalinks,
            ]
        )

        # panoramas older than `max_panoramas_age_days` are not considered
        pano_start_timestamp = int(time.time()) - self.Parameters.max_panoramas_age_days * SECONDS_PER_DAY
        run_binary_cmd(
            self.Parameters.pano_id_generation_binary,
            cmd=[
                '--input-table', bld_orgs_table,
                '--output-table', bld_pano_ids_table,
                '--pano-start-timestamp', str(pano_start_timestamp),
                '--history-folder', yt.ypath_join(self.Parameters.yt_root_path, 'results'),
            ]
        )
        if not yt.exists(bld_pano_ids_table):
            # create empty table for better understanding why task finished
            yt.create('table', bld_pano_ids_table)
            raise NoNewDataError

        raw_versions = self.ecstatic(
            ECSTATIC_ENV,
            [
                'versions',
                '--with-status',
                PANO_DESCRIPTION_DATASET,
                ECSTATIC_BRANCH
            ],
            do_auth=False,
        )
        # `re.findall` returns an empty list when nothing matches, so the
        # list has to be checked before picking its greatest element;
        # indexing it directly would raise IndexError instead of TaskError.
        versions = re.findall(ECSTATIC_ACTIVE_VERSIONS_REGEX, raw_versions)
        version = max(versions) if versions else None
        if not version:
            raise errors.TaskError('Couldn\'t find active version of panoramas dataset in stable')
        logger.info('Downloading %s version of pano dataset', version)

        self.ecstatic(
            ECSTATIC_ENV,
            [
                'download', '{}={}'.format(PANO_DESCRIPTION_DATASET, version),
                '--out', PANO_DESCRIPTION_DATASET,
            ],
            do_auth=False,
        )

        run_binary_cmd(
            self.Parameters.preview_creator_binary,
            cmd=[
                '--pano-index-dir', PANO_DESCRIPTION_DATASET,
                '--width', PANO_WIDTH_PIX,
                '--height', PANO_HEIGHT_PIX,
                '--threads', '5',
                '--input-table', bld_pano_ids_table,
                '--output-table', str(self._project_yt_dir.subpath('panoramas/panoramas')),
            ]
        )

    def _prepare_task_data(self, toloka_stage, in_table, out_table):
        """Convert a table into the input format of the given Toloka task.

        Runs ``<toloka_stage>-task prepare-input-data`` of the toloka tool.

        :param toloka_stage: stage name as a string (``TolokaStage`` value)
        :param in_table: YT path of the table to convert
        :param out_table: YT path to write the prepared table to
        """
        logger.info('Preparing input data for %s task', toloka_stage)
        subcommand = '{}-task'.format(toloka_stage)
        run_binary_cmd(
            self.Parameters.toloka_tool_binary,
            [
                subcommand,
                'prepare-input-data',
                '--input-table', in_table,
                '--output-table', out_table,
            ],
        )

    def _create_task_golden_set(self, toloka_stage):
        """Produce the golden set for one Toloka stage.

        Two modes, selected by ``Parameters.sample_golden_set``:

        * sampling - the golden set is sampled from the stage's
          ``*_task_gs_table`` and the method returns right away;
        * markup - a share of the previous stage results is separated, a
          pool is created for it with the toloka tool, a notification with
          the pool link is sent and the task sleeps until the pool is
          finished; the pool results are then processed into
          ``golden_results`` (and filtered for the second/third stage).

        :param toloka_stage: ``TolokaStage`` member
        :raises sdk2.WaitTime: while the golden set pool is not finished
        :raises errors.TaskError: when the toloka tool output contains no
            pool link
        """
        environ['TOLOKA_TOKEN'] = self.Parameters.sandbox_toloka_token.data()
        stage_name = toloka_stage.value
        task_yt_dir = self._project_yt_dir.subpath('{}_task'.format(stage_name))
        prepared_input_table = str(task_yt_dir.subpath('input_prepared'))
        # NOTE(review): the pool below is created with the sandbox token and
        # without `--production-toloka`, yet this flag follows
        # `use_production_toloka` - confirm the status checks hit the right Toloka.
        use_sandbox = not self.Parameters.use_production_toloka
        # only set (and only used) when existing toloka projects are reused
        toloka_project_id = None
        if toloka_stage == TolokaStage.FIRST:
            prev_stage_results = str(self._project_yt_dir.subpath('panoramas/panoramas'))
            table_to_sample = self.Parameters.first_task_gs_table
            # NOTE: we need to have both type samples (with and without signs) in golden set, so no filtration applied
            filter_condition = None
            # NOTE: toloka golden set size should be approximately 10% of all data
            sample_percent = 10
            if not self.Parameters.create_new_toloka_projects:
                toloka_project_id = str(self.Parameters.first_task_proj_id_sb)
        elif toloka_stage == TolokaStage.SECOND:
            prev_stage_results = str(self._project_yt_dir.subpath('first_task/results'))
            table_to_sample = self.Parameters.second_task_gs_table
            filter_condition = 'DictLength(Yson::ConvertToDict(gt_orgs)) > 0'
            # NOTE: toloka golden set size should be approximately 10% of all data
            # but in this case we're going to filter empty golden samples after markup
            # thus initial golden set size should be bigger. 15 is empirical constant
            sample_percent = 15
            if not self.Parameters.create_new_toloka_projects:
                toloka_project_id = str(self.Parameters.second_task_proj_id_sb)
        else:
            prev_stage_results = str(self._project_yt_dir.subpath('second_task/results'))
            table_to_sample = self.Parameters.third_task_gs_table
            filter_condition = 'ListLength(Yson::ConvertToList(gt_polygons)) > 0'
            # same reasoning as for the second stage: part of the golden
            # samples is filtered out after markup, so sample more than 10%
            sample_percent = 12
            if not self.Parameters.create_new_toloka_projects:
                toloka_project_id = str(self.Parameters.third_task_proj_id_sb)

        if self.Parameters.sample_golden_set:
            with self.memoize_stage['{}_task_sample_gs'.format(stage_name)]:
                if toloka_stage != TolokaStage.FIRST:
                    self._prepare_task_data(
                        toloka_stage=stage_name,
                        in_table=prev_stage_results,
                        out_table=prepared_input_table,
                    )
                    prev_stage_results = prepared_input_table

                sample_golden_set(
                    task_yt_dir,
                    prev_stage_results,
                    table_to_sample,
                    sample_percent,
                    filter_condition,
                )
            return

        non_golden_input_table = str(task_yt_dir.subpath('input_non_golden'))
        golden_input_table = str(task_yt_dir.subpath('input_golden'))
        golden_results_table = str(task_yt_dir.subpath('golden_results'))

        with self.memoize_stage['{}_task_gs_markup'.format(stage_name)]:
            separate_golden_set(task_yt_dir, prev_stage_results, sample_percent)

            if toloka_stage != TolokaStage.FIRST:
                # both halves are converted in place
                for table in [golden_input_table, non_golden_input_table]:
                    self._prepare_task_data(stage_name, in_table=table, out_table=table)

            public_url_format = self.Parameters.public_url_format
            cmd = [
                '--mds-bucket', PUBLIC_MDS_BUCKET if public_url_format else PRIVATE_MDS_BUCKET,
                '{}-task'.format(stage_name),
                'create',
                '--data-table', golden_input_table,
                '--pool-name', str(self.Context.datetime_suffix),
            ]
            if public_url_format:
                cmd.append('--public-url-format')
            if not self.Parameters.create_new_toloka_projects:
                cmd.extend(['--project-id', toloka_project_id])

            tool_output = run_binary_cmd(self.Parameters.toloka_tool_binary, cmd)
            pool_urls = re.findall(TOLOKA_TOOL_POOL_LINK_REGEX, tool_output)
            if not pool_urls:
                # fail with a readable reason instead of a bare IndexError
                raise errors.TaskError(
                    'Couldn\'t find pool link in toloka tool output for {} task golden set'.format(stage_name)
                )
            pool_url = pool_urls[0]
            # store the pool info before notifying, so that a failed
            # notification cannot lose the id of an already created pool
            self.Context.sb_pool_id = pool_url.split('/')[-1]
            self.Context.sb_last_run_stage = stage_name
            self._send_notification(
                stage=stage_name,
                msg='Do the golden set markup at: {}'.format(pool_url),
            )

        # the pool stored in the context belongs to another stage - nothing
        # to wait for or to process here
        if self.Context.sb_last_run_stage != stage_name:
            return

        if not check_toloka_pool_finished(self.Context.sb_pool_id, sandbox_toloka=use_sandbox):
            open_toloka_pool(self.Context.sb_pool_id, sandbox_toloka=use_sandbox)
            logger.info('wait for %s task golden set markup completion', stage_name)
            raise sdk2.WaitTime(self.Parameters.check_toloka_status_period)

        with self.memoize_stage['{}_task_gs_markup_process_results'.format(stage_name)]:
            logger.info('%s task golden set markup completed', stage_name)
            cmd = [
                '{}-task'.format(stage_name),
                'process-results',
                '--input-table', golden_input_table,
                '--output-table', golden_results_table,
                '--pool-id', self.Context.sb_pool_id
            ]
            run_binary_cmd(self.Parameters.toloka_tool_binary, cmd)
            if toloka_stage != TolokaStage.FIRST:
                filter_golden_set_table(golden_results_table, filter_condition)

    def _run_toloka_task(self, toloka_stage):
        """Run the main markup of one Toloka stage and collect its results.

        Creates the pool with the toloka tool (once), waits until there is
        enough money and until the pool is finished, then processes the
        pool results. When the golden set was marked up rather than sampled,
        golden and non golden results are merged into ``results``.

        :param toloka_stage: ``TolokaStage`` member
        :raises sdk2.WaitTime: while waiting for balance refill or for the
            pool to finish
        :raises errors.TaskError: when the toloka tool output contains no
            pool link
        """
        import yt.wrapper as yt
        yt.config['proxy']['url'] = 'hahn.yt.yandex.net'

        environ['TOLOKA_TOKEN'] = self.Parameters.toloka_token.data()
        stage_name = toloka_stage.value
        task_yt_dir = self._project_yt_dir.subpath('{}_task'.format(stage_name))
        use_sandbox = not self.Parameters.use_production_toloka
        max_spendings = self.Parameters.max_dollar_spendings
        input_table = str(task_yt_dir.subpath('input_non_golden'))
        output_table = str(task_yt_dir.subpath('non_golden_results'))
        if self.Parameters.sample_golden_set:
            # nothing was separated for the golden set, so the whole
            # previous stage output is the input
            if toloka_stage == TolokaStage.FIRST:
                input_table = str(self._project_yt_dir.subpath('panoramas/panoramas'))
            else:
                input_table = str(task_yt_dir.subpath('input_prepared'))
            output_table = str(task_yt_dir.subpath('results'))
        if toloka_stage == TolokaStage.FIRST:
            golden_results_table = str(task_yt_dir.subpath('golden_results'))
        else:
            golden_results_table = str(task_yt_dir.subpath('golden_results_filtered'))

        # only set (and only used) when existing toloka projects are reused
        toloka_project_id = None
        if not self.Parameters.create_new_toloka_projects:
            if toloka_stage == TolokaStage.FIRST:
                toloka_project_id = str(self.Parameters.first_task_proj_id)
            elif toloka_stage == TolokaStage.SECOND:
                toloka_project_id = str(self.Parameters.second_task_proj_id)
            else:
                toloka_project_id = str(self.Parameters.third_task_proj_id)

        with self.memoize_stage['{}_task_markup'.format(stage_name)]:
            cmd = [
                '--mds-bucket', PUBLIC_MDS_BUCKET,
                '{}-task'.format(stage_name),
                'create',
                '--public-url-format',
                '--users-quality', '1',
                '--data-table', input_table,
                '--gs-table', golden_results_table,
                '--pool-name', str(self.Context.datetime_suffix),
            ]
            if not use_sandbox:
                cmd.append('--production-toloka')
            if not self.Parameters.create_new_toloka_projects:
                cmd.extend(['--project-id', toloka_project_id])

            tool_output = run_binary_cmd(self.Parameters.toloka_tool_binary, cmd)
            pool_urls = re.findall(TOLOKA_TOOL_POOL_LINK_REGEX, tool_output)
            if not pool_urls:
                # fail with a readable reason instead of a bare IndexError
                raise errors.TaskError(
                    'Couldn\'t find pool link in toloka tool output for {} task'.format(stage_name)
                )
            pool_url = pool_urls[0]
            # NOTE: this variable will contain pool id for the __last__ task we ran
            self.Context.pool_id = pool_url.split('/')[-1]
            self.Context.last_run_stage = stage_name
            if not check_have_money_for_task(self.Context.pool_id, use_sandbox, max_spendings):
                self._send_notification(
                    stage=stage_name,
                    msg='Need money for task markup at: {}'.format(pool_url),
                )

        # the pool stored in the context belongs to another stage - nothing
        # to wait for or to process here
        if self.Context.last_run_stage != stage_name:
            return

        if not check_have_money_for_task(self.Context.pool_id, use_sandbox, max_spendings):
            logger.info('wait for balance refill')
            raise sdk2.WaitTime(self.Parameters.check_toloka_status_period)

        if toloka_stage == TolokaStage.THIRD:
            # not memoized: runs on every wake-up until results are processed
            logger.info('assessing third task')
            cmd = [
                'third-task',
                'assess-results',
                '--gs-table', golden_results_table,
                '--data-table', input_table,
                '--pool-id', self.Context.pool_id,
            ]
            if not use_sandbox:
                cmd.append('--production-toloka')
            run_binary_cmd(self.Parameters.toloka_tool_binary, cmd)

        if not check_toloka_pool_finished(self.Context.pool_id, use_sandbox):
            logger.info('wait for %s task markup completion', stage_name)
            open_toloka_pool(self.Context.pool_id, use_sandbox)

            raise sdk2.WaitTime(self.Parameters.check_toloka_status_period)

        with self.memoize_stage['{}_task_markup_process_results'.format(stage_name)]:
            cmd = [
                '{}-task'.format(stage_name),
                'process-results',
                '--input-table', input_table,
                '--output-table', output_table,
                '--pool-id', self.Context.pool_id
            ]
            if not use_sandbox:
                cmd.append('--production-toloka')
            if toloka_stage != TolokaStage.THIRD:
                cmd.append('--filter-bad-solutions')
            run_binary_cmd(self.Parameters.toloka_tool_binary, cmd)

            if not self.Parameters.sample_golden_set:
                logger.info('merging golden and non golden results')
                # pass the destination as a plain string path, like every
                # other YT call in this task does
                yt.run_merge(
                    [golden_results_table, output_table],
                    str(task_yt_dir.subpath('results'))
                )

    def _finalize_data(self):
        """Build the final ``pretty_results`` table and publish a link to it.

        Runs the "make pretty result" binary over the project folder and
        links its output under ``<yt_root_path>/results/<run suffix>``.
        """
        import yt.wrapper as yt
        yt.config['proxy']['url'] = 'hahn.yt.yandex.net'

        pretty_table = str(self._project_yt_dir.subpath('pretty_results'))
        run_binary_cmd(
            self.Parameters.make_pretty_result_binary,
            [
                '--project-folder', str(self._project_yt_dir),
                '--output-table', pretty_table,
            ],
        )

        yt.link(
            pretty_table,
            yt.ypath_join(self.Parameters.yt_root_path, 'results', self.Context.datetime_suffix),
        )

    def _run_sign_area(self):
        """Run the sign area binary over the third task results.

        Reads ``third_task/results`` and writes ``sign_area_results`` in
        the project directory.
        """
        import yt.wrapper as yt
        yt.config['proxy']['url'] = 'hahn.yt.yandex.net'

        project_dir = self._project_yt_dir
        sign_area_table = str(project_dir.subpath('sign_area_results'))
        third_results = str(project_dir.subpath('third_task').subpath('results'))
        run_binary_cmd(
            self.Parameters.sign_area_binary,
            [
                '--third-task-results-path', third_results,
                '--output-table', sign_area_table,
            ],
        )

    def on_execute(self):
        """Task entry point: run the whole pipeline, resuming where it stopped.

        Prepares panoramas once, then for each Toloka stage builds the
        golden set and runs the markup, and finally produces the sign area
        and pretty result tables. A notification is sent on success, on
        "no new panoramas" and on any error (the error is re-raised).
        """
        self._setup_environment()

        with self.memoize_stage.prepare_panoramas_stage:
            try:
                self._prepare_panoramas()
            except NoNewDataError:
                # nothing to mark up - report and finish without failing
                self._send_notification(
                    stage='pano creation',
                    msg='no new panoramas found. finishing',
                )
                return

        # NOTE(review): the stage methods raise sdk2.WaitTime to suspend the
        # task; confirm it is not an Exception subclass, otherwise every
        # wait would also go through the error notification below.
        try:
            for stage in (TolokaStage.FIRST, TolokaStage.SECOND, TolokaStage.THIRD):
                self._create_task_golden_set(toloka_stage=stage)
                self._run_toloka_task(stage)

            self._run_sign_area()
            self._finalize_data()

            finish_message = 'pipeline for {} project finished'.format(self.Context.datetime_suffix)
            self._send_notification(stage='final', msg=finish_message)
        except Exception as error:
            self._send_notification(stage='pipeline error', msg=str(error))
            raise
