# coding=UTF-8
import sys
from datetime import datetime

from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.svn import Arcadia
from sandbox.sandboxsdk import parameters, environments

# Passed as `batch_size` to `find_by_cgi` in ImagesGrayQueriesFilter.on_execute.
BATCH_SIZE = 1500


class MrInputTableParam(parameters.ListRepeater, parameters.SandboxStringParameter):
    """Task input: list of input tables.

    The task iterates over this list and collects the 'query' field of
    every row it reads from each table.
    """

    name = 'mr_input_table'
    description = 'Input MapReduce table(s) with queries'
    group = 'MR'
    required = True


class MrOutputTablePrefixParam(parameters.SandboxStringParameter):
    """Task input: optional prefix placed directly before the output table name.

    The output table is built as '<path>/<prefix><name>'; a falsy value
    is treated as an empty prefix.
    """

    name = 'mr_output_prefix'
    description = 'Output MapReduce prefix'
    group = 'MR'
    required = False


class MrOutputTablePathParam(parameters.SandboxStringParameter):
    """Task input: path under which the output table is created.

    The value is stored without trailing slashes because the task joins
    path and table name with its own '/' separator.
    """

    name = 'mr_output_path'
    description = 'Output MapReduce path'
    group = 'MR'
    required = True

    @classmethod
    def cast(cls, value):
        """Apply the base-class cast, then drop every trailing '/'."""
        base_result = super(MrOutputTablePathParam, cls).cast(value)
        without_trailing_slashes = base_result.rstrip('/')
        return without_trailing_slashes


class MrOutputTableNameParam(parameters.SandboxStringParameter):
    """Task input: name of the output table.

    When left empty or blank, the task's on_enqueue replaces it with the
    current date formatted as '%Y-%m-%d'.
    """

    name = 'mr_output_table_name'
    description = 'Output MapReduce table name. If empty will use current date'
    group = 'MR'
    required = False


class MrServerParam(parameters.SandboxStringParameter):
    """Task input: server address handed to the YtClient constructor."""

    name = 'mr_server'
    description = 'MapReduce server'
    group = 'MR'
    required = True


class RegionParam(parameters.SandboxIntegerParameter):
    """Task input: region id forwarded as `region` to `find_by_cgi`."""

    name = 'region'
    description = 'Region for all queries'
    required = True
    # Integer default for an integer parameter (it was the string '225'),
    # matching how ScraperPageSizeParam declares its default.
    default_value = 225
    group = 'Main'


class HostParam(parameters.SandboxStringParameter):
    """Task input: host URL forwarded as `host` to `find_by_cgi`."""

    name = 'host'
    description = 'Host to take serps from'
    group = 'Main'
    required = True
    default_value = 'http://hamster.yandex.ru'


class AuthParam(parameters.SandboxStringParameter):
    """Task input: vault owner used to fetch the task's secrets.

    The task reads the 'yt_token', 'scraper_user' and 'scraper_oauth'
    vault items for this owner via get_vault_data.
    """

    name = 'auth_vault'
    description = "Sandbox vault user (must contain 'scraper_user', 'scraper_oauth', 'yt_token' values)"
    group = 'Auth'
    required = True


class ScraperPageSizeParam(parameters.SandboxIntegerParameter):
    """Task input: value forwarded as `page_size` to `find_by_cgi`."""

    name = 'scraper_page_size_param'
    description = 'Scraper page size param'
    group = 'Scraper'
    required = True
    default_value = 30


class ScraperCgiParamLeft(parameters.ListRepeater, parameters.SandboxStringParameter):
    """Task input: list of 'name=value' CGI strings for the left side.

    The task splits each entry on the first '=' and passes the result as
    the positional CGI-parameters argument of `find_by_cgi`.
    """

    name = 'scraper_cgi_params_left'
    description = 'Scraper cgi parameters original'
    group = 'Scraper'
    required = True


class ScraperCgiParamRight(parameters.ListRepeater, parameters.SandboxStringParameter):
    """Task input: list of 'name=value' CGI strings for the right side.

    The task splits each entry on the first '=' and passes the result as
    `cgi_params_right` to `find_by_cgi`.
    """

    name = 'scraper_cgi_params_right'
    description = 'Scraper cgi parameters filtered'
    group = 'Scraper'
    required = True


# Values of the 'scraper_diff_mode' radio parameter (ScraperDiffParam).
# The task passes diff=True to `find_by_cgi` only for SUBTRACT_DIFF_MODE.
INTERSECT_DIFF_MODE = 'Intersect'
SUBTRACT_DIFF_MODE = 'Subtract'


class ScraperDiffParam(parameters.SandboxRadioParameter):
    """Task input: radio choice between the two diff modes.

    Each choice is a (label, value) pair; the default is the subtract mode.
    """

    name = 'scraper_diff_mode'
    description = 'Scraper diff mode'
    group = 'Scraper'
    required = True
    default_value = SUBTRACT_DIFF_MODE
    choices = (
        ('Subtract', SUBTRACT_DIFF_MODE),
        ('Intersect', INTERSECT_DIFF_MODE),
    )


class ImagesGrayQueriesFilter(SandboxTask):
    """
        Filtering gray images' queries, SEARCHSPAM-10062

        Reads queries from the input tables, hands them to
        ``erf_monitor.find_by_filter.find_by_cgi`` together with the two sets
        of CGI parameters, and writes the unique URLs found in the result to
        a table sorted by ``url``.
    """

    type = 'IMAGES_GRAY_QUERIES_FILTER'

    input_parameters = [
        MrServerParam,
        MrInputTableParam,
        MrOutputTablePathParam,
        MrOutputTablePrefixParam,
        MrOutputTableNameParam,
        RegionParam,
        HostParam,
        AuthParam,
        ScraperPageSizeParam,
        ScraperCgiParamLeft,
        ScraperCgiParamRight,
        ScraperDiffParam,
    ]
    environment = (
        environments.PipEnvironment('yandex-yt', '0.8.11-0'),
        environments.PipEnvironment('yandex-yt-yson-bindings-skynet', '0.3.7.post1'),
    )

    local_dir = 'local_files'
    scraper_task_name = 'sandbox:images gray queries filter'
    scraper_preset_name = 'yandex-images-profiled'

    @staticmethod
    def _cast_cgi_params(cgi_list):
        """Split 'name=value' strings into ``[name, value]`` lists.

        Empty and whitespace-only entries are skipped. Only the first '='
        separates the name from the value, so values may themselves contain
        '='. An entry without '=' gets an empty value, so every returned
        item has exactly two elements.

        :param cgi_list: iterable of strings
        :return: list of two-element lists
        """
        result = []
        for val in cgi_list:
            if not val.strip():
                continue
            name, _sep, value = val.partition(u'=')
            result.append([name, value])
        return result

    def _get_queries_from_mapreduce(self, yt_client):
        """Collect the distinct non-empty 'query' values of all input tables.

        :param yt_client: client object providing ``read_table``
        :return: set of query values
        """
        result = set()

        for table in self.ctx[MrInputTableParam.name]:
            for item in yt_client.read_table(table, format='yson', raw=False):
                query = item.get('query')
                # Rows without a query (missing or empty) are ignored.
                if query:
                    result.add(query)
        return result

    def _send_docs_to_mapreduce(self, yt_client, found):
        """Write the unique URLs from ``found`` to the output table and sort it.

        :param yt_client: client object providing ``write_table`` and ``run_sort``
        :param found: mapping whose values are iterables of (position, url)
            pairs; only the URLs are used
        """
        urls = {url for docs in found.values() for _pos, url in docs}
        rows = [{'url': url} for url in urls]

        # The prefix is optional; a falsy value means "no prefix".
        prefix = self.ctx[MrOutputTablePrefixParam.name] or ''
        table = '{}/{}{}'.format(
            self.ctx[MrOutputTablePathParam.name],
            prefix,
            self.ctx[MrOutputTableNameParam.name]
        )

        yt_client.write_table(table, rows)
        yt_client.run_sort(table, sort_by='url')

    def on_enqueue(self):
        """Fix the output table name in the context before the task is queued.

        The name is stripped of surrounding whitespace so it cannot leak into
        the table path; an empty or blank name is replaced with the current
        date formatted as '%Y-%m-%d'.
        """
        value = (self.ctx[MrOutputTableNameParam.name] or '').strip()
        if not value:
            value = datetime.today().strftime('%Y-%m-%d')
        self.ctx[MrOutputTableNameParam.name] = value
        SandboxTask.on_enqueue(self)

    def on_execute(self):
        """Run the task: read queries, call ``find_by_cgi``, store found URLs."""
        self._setup_environment()

        # Imported here rather than at module level: erf_monitor is located
        # through the sys.path entries added by _setup_environment, and
        # yt.wrapper is presumably installed by the pip environments declared
        # in `environment` (confirm).
        from erf_monitor.find_by_filter import find_by_cgi
        from yt.wrapper import YtClient

        auth_owner = self.ctx[AuthParam.name]
        server = self.ctx[MrServerParam.name]
        token = self.get_vault_data(auth_owner, 'yt_token')
        yt_client = YtClient(server, token, {'yamr_mode': {'treat_unexisting_as_empty': True}})

        queries = self._get_queries_from_mapreduce(yt_client)

        scraper_user = self.get_vault_data(auth_owner, 'scraper_user')
        oauth_token = self.get_vault_data(auth_owner, 'scraper_oauth')

        requests = [{'query-text': query} for query in queries]
        region = self.ctx[RegionParam.name]
        cgi_params_left = self._cast_cgi_params(self.ctx[ScraperCgiParamLeft.name])
        cgi_params_right = self._cast_cgi_params(self.ctx[ScraperCgiParamRight.name])
        # The second element of the returned pair (a request count) is not used.
        found, _request_count = find_by_cgi(
            scraper_user, oauth_token, self.scraper_task_name, requests,
            cgi_params_left,
            cgi_params_right=cgi_params_right,
            region=region,
            host=self.ctx[HostParam.name],
            page_size=self.ctx[ScraperPageSizeParam.name],
            scraper_preset=self.scraper_preset_name,
            diff=self.ctx[ScraperDiffParam.name] == SUBTRACT_DIFF_MODE,
            batch_size=BATCH_SIZE)

        self._send_docs_to_mapreduce(yt_client, found)

    @staticmethod
    def _setup_environment():
        """Make the required Arcadia source directories importable.

        Each directory returned by ``Arcadia.get_arcadia_src_dir`` is appended
        to ``sys.path`` unless it is already there, so repeated calls do not
        add duplicate entries.
        """
        arcadia_urls = (
            'arcadia:/arc/trunk/arcadia/yweb/antispam/util',
            'arcadia:/arc/trunk/arcadia/yweb/antiporno/scripts',  # erf_monitor
        )
        for url in arcadia_urls:
            src_dir = Arcadia.get_arcadia_src_dir(url)
            if src_dir not in sys.path:
                sys.path.append(src_dir)


# Module-level alias of the task class. Presumably the Sandbox task loader
# looks up `__Task__` to discover the task — confirm against the SDK.
__Task__ = ImagesGrayQueriesFilter
