# coding=UTF-8

import os
from datetime import datetime, timedelta

from sandbox.sandboxsdk import environments, parameters
from sandbox.sandboxsdk.paths import copy_path
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.svn import Arcadia


# File name of the mining script that the task fetches from Arcadia and runs.
SCRIPT_NAME = 'collect_queries.py'
# strptime/strftime pattern used to parse and normalise the date parameters.
DATE_FORMAT = '%Y-%m-%d'

# Version of the yandex-yt-yson-bindings package that is pip-installed
# right before the mining script is started.
YSON_BINDINGS_VERSION = '0.3.7-1'


class MrOutputTablePrefixParam(parameters.SandboxStringParameter):
    """Required string parameter: prefix of the output MapReduce tables.

    The task appends a per-mode suffix to this prefix to form each
    destination table name.
    """

    group = 'MR'
    required = True
    name = 'mr_output_table'
    description = 'Output MapReduce table prefix. Mode name is used for suffix'


class MrServerParam(parameters.SandboxStringParameter):
    """Required string parameter: the MapReduce server passed to the script."""

    group = 'MR'
    required = True
    name = 'mr_server'
    description = 'MapReduce server'


class YtAuthParam(parameters.SandboxStringParameter):
    """Required string parameter: Sandbox vault user owning the 'yt_token' entry."""

    group = 'Auth'
    required = True
    name = 'auth_vault'
    description = "Sandbox vault user (must contain 'yt_token' value)"


class MrUserSessionsPath(parameters.SandboxStringParameter):
    """Required string parameter: base path of the user_sessions tables."""

    group = 'MR'
    required = True
    name = 'mr_user_sessions_path'
    description = 'Path to user_sessions tables'
    default_value = 'user_sessions'

    @classmethod
    def cast(cls, value):
        """Cast through the base parameter and drop trailing '/' characters."""
        return super(MrUserSessionsPath, cls).cast(value).rstrip('/')


class RegionParam(parameters.SandboxStringParameter):
    """Required string parameter: region bigram handed to the script ('ru' by default)."""

    group = 'Main'
    required = True
    name = 'region'
    description = 'Region bigram'
    default_value = 'ru'


class FromDateParam(parameters.SandboxStringParameter):
    """Optional string parameter: first day of the mining interval."""

    group = 'Date'
    required = False
    name = 'from_date'
    description = 'Starting date to mine. By default mine previous week'

    @classmethod
    def cast(cls, value):
        """Return the date re-rendered in DATE_FORMAT, or '' for blank input.

        Non-blank input that does not match DATE_FORMAT makes ``strptime``
        raise ``ValueError``.
        """
        text = super(FromDateParam, cls).cast(value)
        if text.strip():
            # Parse and re-render so the value comes back in DATE_FORMAT form.
            parsed = datetime.strptime(text, DATE_FORMAT)
            return parsed.strftime(DATE_FORMAT)
        return ''


class ToDateParam(parameters.SandboxStringParameter):
    """Optional string parameter: last day of the mining interval."""

    group = 'Date'
    required = False
    name = 'to_date'
    description = 'Ending date to mine'

    @classmethod
    def cast(cls, value):
        """Return the date re-rendered in DATE_FORMAT, or '' for blank input.

        Non-blank input that does not match DATE_FORMAT makes ``strptime``
        raise ``ValueError``.
        """
        text = super(ToDateParam, cls).cast(value)
        if text.strip():
            # Parse and re-render so the value comes back in DATE_FORMAT form.
            parsed = datetime.strptime(text, DATE_FORMAT)
            return parsed.strftime(DATE_FORMAT)
        return ''


class MiningModeParam(parameters.DictRepeater, parameters.SandboxStringParameter):
    """Required dict parameter mapping a mining-mode flag to an amount.

    Keys must be one of OPTIONS; values are converted to ``int`` by ``cast``.
    """

    # Mode flags accepted as keys of the parameter value.
    OPTIONS = [
        '--web-gray',
        '--web-child',
        '--web-white',
        '--web-tumblr',
        '--img-white',
        '--img-gray',
        '--img-child',
        '--img-person',
        '--video-white',
    ]

    group = 'Main'
    required = True
    name = 'mining_mode'
    description = 'Mining mode and amount. Available modes: ' + ', '.join(OPTIONS)

    @classmethod
    def cast(cls, value):
        """Validate the mode flags and convert every amount to an integer.

        Raises ``KeyError`` for a key that is not listed in OPTIONS; ``int``
        raises for an amount that cannot be converted.
        """
        raw = super(MiningModeParam, cls).cast(value)
        amounts = {}
        for option, amount in raw.iteritems():
            if option in MiningModeParam.OPTIONS:
                amounts[option] = int(amount)
            else:
                raise KeyError('Option {} is not available'.format(option))
        return amounts


class PoolNameParam(parameters.SandboxStringParameter):
    """Optional string parameter: YT pool name passed to the script via --yt-pool."""

    group = 'MR'
    required = False
    name = 'yt_pool_name'
    description = 'YT pool name'


class CollectTopQueries(SandboxTask):
    """
        Collect top nasty queries, SEARCHSPAM-10062, SEARCHSPAM-10590
    """
    # Workflow: on_enqueue resolves the date interval, on_execute fetches the
    # mining script from Arcadia, stores the YT token, installs the yson
    # bindings and finally runs the script over one source table per day.

    type = 'COLLECT_TOP_QUERIES'

    input_parameters = [
        MrServerParam,
        YtAuthParam,
        PoolNameParam,
        MrOutputTablePrefixParam,
        MrUserSessionsPath,
        RegionParam,
        MiningModeParam,
        FromDateParam,
        ToDateParam,
    ]
    environment = (
        environments.PipEnvironment('yandex-yt', "0.8.11-0"),
    )

    def on_enqueue(self):
        """Resolve missing date bounds and store them back into the context.

        A blank ``to_date`` becomes yesterday; a blank ``from_date`` becomes
        one week before the (possibly defaulted) ``to_date``.
        """
        from_date_str = self.ctx[FromDateParam.name]
        to_date_str = self.ctx[ToDateParam.name]

        if not to_date_str or not to_date_str.strip():
            to_date = datetime.today() - timedelta(days=1)
            self.ctx[ToDateParam.name] = to_date.strftime(DATE_FORMAT)
        else:
            to_date = datetime.strptime(to_date_str, DATE_FORMAT)
        if not from_date_str or not from_date_str.strip():
            from_date = to_date - timedelta(weeks=1)
            self.ctx[FromDateParam.name] = from_date.strftime(DATE_FORMAT)

        SandboxTask.on_enqueue(self)

    def on_execute(self):
        """Prepare the environment and run the mining script.

        The script receives one ``--src`` table per day of the interval and
        one ``<mode> <destination table> <amount>`` triple per mining mode.
        """
        self._setup_environment()

        # Build both argument lists before installing anything, so malformed
        # context values fail early.
        tables = self._source_table_args()
        modes = self._mode_args()

        run_process(
            [
                'pip', 'install',
                # pip pins an exact version with '=='; a single '=' is
                # rejected as an invalid requirement specifier.
                'yandex-yt-yson-bindings=={}'.format(YSON_BINDINGS_VERSION),
                '--user'
            ],
            log_prefix='pip_install_yandex-yt-yson-bindings_{}'.format(YSON_BINDINGS_VERSION)
        )

        run_process(
            [
                'python',
                SCRIPT_NAME,
                '--server', self.ctx[MrServerParam.name],
                '-r', self.ctx[RegionParam.name],
                # NOTE(review): the pool name is optional, yet the flag is
                # always passed (possibly with an empty value) - confirm the
                # script accepts that.
                '--yt-pool', self.ctx[PoolNameParam.name],
            ] + modes + tables,
            log_prefix='collect_queries'
        )

    def _source_table_args(self):
        """Return ``['--src', table, ...]`` with one table per day of the interval."""
        first_day = datetime.strptime(self.ctx[FromDateParam.name], DATE_FORMAT)
        last_day = datetime.strptime(self.ctx[ToDateParam.name], DATE_FORMAT)

        # An inverted interval collapses to the single starting day.
        if first_day > last_day:
            last_day = first_day

        sessions_path = self.ctx[MrUserSessionsPath.name]
        args = []
        day = first_day
        while day <= last_day:
            table_name = '{}/{}/clean'.format(sessions_path, day.strftime('%Y-%m-%d'))
            args.extend(['--src', table_name])
            day += timedelta(days=1)
        return args

    def _mode_args(self):
        """Return ``[mode, destination_table, amount, ...]`` for every mining mode."""
        prefix = self.ctx[MrOutputTablePrefixParam.name]
        args = []
        for mode, amount in self.ctx[MiningModeParam.name].iteritems():
            # str.lstrip takes a set of characters, so this drops every
            # leading dash: '--web-gray' -> 'web_gray'.
            suffix = mode.lstrip('-').replace('-', '_')
            dst = '{prefix}_{mode}'.format(prefix=prefix, mode=suffix)
            args.extend([mode, dst, str(amount)])
        return args

    def _setup_environment(self):
        """Fetch the script and its parse library from Arcadia and store the YT token."""
        CollectTopQueries._import_from_arcadia('arcadia:/arc/trunk/arcadia/quality/logs/parse_lib_py/', 'parselib.py')
        CollectTopQueries._import_from_arcadia('arcadia:/arc/trunk/arcadia/yweb/antiporno/scripts/collect_queries/',
                                               SCRIPT_NAME)
        self._provide_yt_token()

    def _provide_yt_token(self):
        """Write the vault's 'yt_token' value to ``~/.yt/token``."""
        # Read the secret first: if the vault lookup fails, no empty or
        # truncated token file is left behind.
        yt_token = self.get_vault_data(self.ctx[YtAuthParam.name], 'yt_token')

        yt_token_dir = os.path.join(os.path.expanduser('~'), '.yt')
        if not os.path.exists(yt_token_dir):
            os.mkdir(yt_token_dir)

        token_path = os.path.join(yt_token_dir, 'token')
        # A newly created token file is readable and writable by its owner only.
        token_fd = os.open(token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(token_fd, 'w') as fo:
            fo.write(yt_token)

    @staticmethod
    def _import_from_arcadia(path, file_name):
        """Copy ``file_name`` from the Arcadia directory ``path`` to the relative path ``file_name``."""
        copy_path(
            os.path.join(
                Arcadia.get_arcadia_src_dir(path),
                file_name
            ),
            file_name
        )


# NOTE(review): presumably the module-level hook through which Sandbox
# locates the task class defined in this file - confirm against the SDK.
__Task__ = CollectTopQueries
