# -*- coding: utf-8 -*-

import os
import logging
import random
import urllib2
from shutil import copyfile

from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.errors import SandboxTaskFailureError
from sandbox.sandboxsdk import process
from sandbox.sandboxsdk.svn import Arcadia
from sandbox.sandboxsdk.parameters import SandboxStringParameter, SandboxBoolParameter

from sandbox.projects.common.wizard.wizard_builder import WizardBuilder

from sandbox.projects.common import apihelpers

# Value of the `group` attribute shared by every zone / query-type boolean
# parameter declared in this module.
_LANG_GROUPS = "Query types"


def _zone_parameter(tld):
    """Build a boolean parameter class for one search zone.

    The parameter name is ``zone_`` followed by *tld* with dots replaced by
    underscores; *tld* itself is used as the description. The parameter is
    enabled by default and belongs to the ``_LANG_GROUPS`` group.

    :param tld: zone identifier such as ``'ru'`` or ``'com.tr'``
    :return: a new ``SandboxBoolParameter`` subclass
    """
    field_name = 'zone_' + tld.replace('.', '_')

    class Zone(SandboxBoolParameter):
        group = _LANG_GROUPS
        default_value = True
        description = tld
        name = field_name

    return Zone


# One boolean parameter class per supported zone. A generator expression is
# used so that no loop variable is left behind in the module namespace.
LangRu, LangUa, LangKz, LangBy, LangCom, LangComTr = (
    _zone_parameter(zone) for zone in ('ru', 'ua', 'kz', 'by', 'com', 'com.tr')
)


class LangKUB(SandboxBoolParameter):
    """Boolean parameter (on by default) enabling the 'KUB' query set."""
    group = _LANG_GROUPS
    default_value = True
    description = 'KUB'
    name = 'generate_kub'


class LangLight(SandboxBoolParameter):
    """Boolean parameter (on by default) enabling the 'light' query set."""
    group = _LANG_GROUPS
    default_value = True
    description = 'light'
    name = 'generate_light'


# All boolean query-type parameters: the per-zone ones plus KUB and light.
LANGS = [LangRu, LangUa, LangKz, LangBy, LangCom, LangComTr, LangKUB, LangLight]


class Hostname(SandboxStringParameter):
    """String parameter naming the host whose logs are fetched."""
    description = 'Host name'
    name = 'hostname'


class HostType(SandboxStringParameter):
    """String parameter selecting the kind of host to take logs from.

    The allowed values are 'wizard' (default), 'entitysearch_test' and
    'lingvoboost'. The query-type parameters are registered as sub-fields of
    the 'wizard' and 'entitysearch_test' values only.
    """
    description = 'Host type'
    name = 'host_type'
    default_value = 'wizard'
    # (label, value) pairs -- NOTE(review): pair order assumed from usage of
    # 'entitysearch_test' as a sub_fields key; confirm against the SDK.
    choices = [
        ('wizard', 'wizard'),
        ('entitysearch', 'entitysearch_test'),
        ('lingvoboost', 'lingvoboost'),
    ]
    sub_fields = {
        'entitysearch_test': [lang.name for lang in LANGS],
        'wizard': [lang.name for lang in LANGS],
    }


class WizardLogs(SandboxTask):
    """Sample queries from a host eventlog and publish them as resources.

    The task rsyncs the current eventlog from a host, pipes its dump through
    the exported ``get_queries.py`` script, and for every enabled zone mixes a
    random sample of the new queries into the previously published query
    file. The result is published as a ``PLAIN_TEXT_QUERIES`` resource.
    Optional derived sets ('kub', 'light') are built from the per-zone files.
    """
    type = 'WIZARD_LOGS'

    input_parameters = [Hostname, HostType] + LANGS

    # Old query file is trimmed once it grows beyond this many lines.
    total_limit = 100000
    # Number of random draws made from the new queries on each run.
    daily_limit = 100000

    @staticmethod
    def _get_lines_num(filename):
        """Return the number of lines in *filename*."""
        with open(filename) as lines:
            return sum(1 for _ in lines)

    @staticmethod
    def get_long_tail_queries(all_queries_path, lang):
        """Write the "long" queries of *all_queries_path* to a new file.

        A line is copied for each of its ``&``-separated parts that starts
        with ``text=`` and is longer than 85 characters after URL-unquoting
        (the ``text=`` prefix is included in that length).

        :param all_queries_path: path to the file with one request per line
        :param lang: zone name, used to build the output file name
        :return: path of the written file (``<lang>_long_tail.txt``)
        """
        long_tail_queries_path = '{}_long_tail.txt'.format(lang)
        with open(long_tail_queries_path, 'w') as long_tail_queries_file, \
                open(all_queries_path) as all_queries_file:
            for line in all_queries_file:
                for part in line.split('&'):
                    if part.startswith('text=') and len(urllib2.unquote(part)) > 85:
                        long_tail_queries_file.write(line)
        return long_tail_queries_path

    def store_queries(self, queries_filename, lang, long_version=True):
        """Publish *queries_filename* as a ``PLAIN_TEXT_QUERIES`` resource.

        :param queries_filename: path to the query file to publish
        :param lang: zone / set name stored as the resource attribute value
        :param long_version: when true, also publish the long-query subset
            produced by :meth:`get_long_tail_queries`
        """
        host_type = self.ctx[HostType.name]
        self.create_resource(
            '{} queries from daily log ({} lines)'.format(lang, self._get_lines_num(queries_filename)),
            queries_filename,
            'PLAIN_TEXT_QUERIES',
            attributes={'{}_log_queries'.format(host_type): lang}
        )
        if not long_version:
            return
        long_queries_path = self.get_long_tail_queries(queries_filename, lang)
        self.create_resource(
            '{} long queries from daily log ({} lines)'.format(lang, self._get_lines_num(long_queries_path)),
            long_queries_path,
            'PLAIN_TEXT_QUERIES',
            attributes={'{}_log_long_queries'.format(host_type): lang}
        )

    def store_kub_queries(self, stored_queries):
        """Publish the 'kub' set: a concatenation of kz, ua and by queries.

        Zones missing from *stored_queries* (disabled or skipped) are left
        out instead of raising ``KeyError``; if none is available nothing is
        published.

        :param stored_queries: mapping of zone name to its query file path
        """
        available = [stored_queries[lang] for lang in ('kz', 'ua', 'by') if lang in stored_queries]
        if not available:
            logging.info('no kz/ua/by queries were stored, skipping kub queries')
            return

        kub_queries_path = 'kub_queries.txt'
        with open(kub_queries_path, 'w') as kub_queries:
            for zone_path in available:
                with open(zone_path) as zone_queries:
                    for i, line in enumerate(zone_queries):
                        # Index-based cap kept from the original code: lines
                        # with index 0..100000 are copied from each zone.
                        if i > 100000:
                            break
                        kub_queries.write(line)
        self.store_queries(kub_queries_path, 'kub')

    def store_light_queries(self, stored_queries):
        """Publish the 'light' set derived from the first 50000 ru queries.

        The first 25000 requests get ``&rn=light`` appended, the next 25000
        get ``&rn=superlight``. Nothing is published when no ru queries were
        stored.

        :param stored_queries: mapping of zone name to its query file path
        """
        ru_queries_path = stored_queries.get('ru')
        if ru_queries_path is None:
            logging.info('no ru queries were stored, skipping light queries')
            return

        light_queries_path = 'light_queries.txt'
        with open(light_queries_path, 'w') as light_queries, open(ru_queries_path) as ru_queries:
            for i, req in enumerate(ru_queries):
                if i < 25000:
                    light_queries.write(req.replace('\n', '') + '&rn=light\n')
                elif i < 50000:
                    light_queries.write(req.replace('\n', '') + '&rn=superlight\n')
                else:
                    break
        self.store_queries(light_queries_path, 'light', long_version=False)

    def get_langs_to_process(self):
        """Return the list of zones to process.

        For the 'lingvoboost' host type this is always ``["no_reg"]``.
        Otherwise it is every truthy ``zone_*`` key of the task context, with
        the prefix removed and underscores turned back into dots.
        """
        if self.ctx[HostType.name] == 'lingvoboost':
            return ["no_reg"]
        langs = []
        for key in self.ctx:
            if key.startswith('zone_') and self.ctx[key]:
                langs.append(key.split('zone_')[-1].replace('_', '.'))
        return langs

    @staticmethod
    def get_entitysearch_port(host):
        """Return the port associated with the gencfg group listing *host*.

        The short host name (text before the first dot) is searched for in
        the raw instance listing of each known group.

        :param host: host name, possibly fully qualified
        :raises SandboxTaskFailureError: if no group listing mentions the host
        """
        groups = {
            'MSK_FOL': 11600,
            'SAS': 8400,
            'MAN': 13000,
        }
        short_name = host.split('.')[0]

        for group, port in groups.items():
            response = urllib2.urlopen(
                'http://api.gencfg.yandex-team.ru/trunk/searcherlookup/groups/{}_ENTITYSEARCH_WIZARD/instances'.format(group)
            )
            # urllib2 responses are not context managers; close explicitly.
            try:
                instances = response.read()
            finally:
                response.close()
            if short_name in instances:
                return port

        raise SandboxTaskFailureError('no port')

    def select_random_host(self):
        """Resolve hosts for the selected host type and store a random one in ctx."""
        from library.sky.hostresolver import Resolver

        if self.ctx[HostType.name] == 'wizard':
            resolve_cmd = 'I@a_prj_search-wizard'
        elif self.ctx[HostType.name] == 'lingvoboost':
            resolve_cmd = 'I@MSK_WIZARD_LINGVOBOOST_WIZARD'
        else:
            resolve_cmd = 'I@a_tier_EntitysearchTier0'
        self.ctx[Hostname.name] = random.choice(list(Resolver().resolveHosts(resolve_cmd)))

    def get_remote_eventlog_name(self, host):
        """Return the remote eventlog file name for *host*.

        Port 8891 is used for 'wizard' and 'lingvoboost' host types; for any
        other type the port comes from :meth:`get_entitysearch_port`.
        """
        evlog_name_template = 'current-eventlog-wizard-{}'
        if self.ctx[HostType.name] in ('wizard', 'lingvoboost'):
            port = 8891
        else:
            port = self.get_entitysearch_port(host)
        return evlog_name_template.format(port)

    def on_execute(self):
        """Fetch the eventlog, extract queries and publish per-zone resources."""
        if self.ctx[HostType.name] == 'lingvoboost':
            # Zone / KUB / light options do not apply to lingvoboost.
            for lang in LANGS:
                self.ctx[lang.name] = False
        script_svn = 'arcadia:/arc/trunk/arcadia/web/wizard_tools/armory/get_queries.py'
        get_queries = self.abs_path('get_queries.py')
        Arcadia.export(script_svn, get_queries)

        logging.info('Getting evlogdump....')
        evlogdump = self.sync_resource(WizardBuilder.evlogdump_from_task(WizardBuilder.get_production_wizard_task().id))

        logging.info('Getting host....')
        if not self.ctx.get(Hostname.name):
            self.select_random_host()

        logging.info('rsync logs from %s', self.ctx[Hostname.name])
        rsync_cmd = [
            'rsync',
            '-avz',
            '{}::logs/{}'.format(self.ctx[Hostname.name], self.get_remote_eventlog_name(self.ctx[Hostname.name])),
            'eventlog',
        ]
        process.run_process(rsync_cmd, log_prefix='rsync')

        logging.info('Dumping logs')
        evlogdump_cmd = "{} eventlog | {}".format(evlogdump, get_queries)
        process.run_process(evlogdump_cmd, shell=True, log_prefix='get_queries')

        langs = self.get_langs_to_process()
        logging.info('langs to process: %s', langs)

        stored_queries = {}
        for lang in langs:
            # presumably written by get_queries.py above -- TODO confirm
            queries = self.abs_path('reqs_%s.txt' % lang)
            old_queries = apihelpers.get_last_resource_with_attribute(
                'PLAIN_TEXT_QUERIES', '{}_log_queries'.format(self.ctx[HostType.name]), lang
            )
            logging.info('lang is %s', lang)
            lines = self._get_lines_num(queries)

            if old_queries:
                logging.info("resource found")
                old_queries_filename = self.abs_path('old_queris_%s.txt' % lang)
                copyfile(self.sync_resource(old_queries.id), old_queries_filename)
                os.chmod(old_queries_filename, 0o777)
                old_lines = self._get_lines_num(old_queries_filename)
                if old_lines > self.total_limit:
                    limit_exceeding = old_lines - self.total_limit
                    logging.info('limit exceeding is %s', limit_exceeding)
                    logging.info('lines in new queries is %s', lines)

                    # Drop the oldest lines: the excess over the limit plus
                    # room for what may be appended below.
                    first_kept = limit_exceeding + min(self.daily_limit, lines)
                    # Use the absolute path both for writing and afterwards,
                    # so appending/publishing hits the file written here.
                    trimmed_filename = self.abs_path('new_queries_%s_%s.txt' % (lang, self.id))
                    with open(trimmed_filename, 'w') as trimmed, open(old_queries_filename) as old_file:
                        for num, line in enumerate(old_file, 1):
                            if num >= first_kept:
                                trimmed.write(line)
                    logging.info('%s old queries', old_lines)
                    old_queries_filename = trimmed_filename
                mode = 'a'
            else:
                logging.info("resource not found")
                old_queries_filename = self.abs_path('old_queries_%s.txt' % lang)
                mode = 'w'

            if not lines:
                logging.info("no new logs!")
                if not old_queries:
                    # Neither old nor new queries: there is no file to
                    # publish, so skip the zone instead of failing on a
                    # missing file.
                    logging.warning('nothing to store for %s, skipping', lang)
                    continue
            else:
                # Draws with replacement, so at most daily_limit distinct lines.
                line_nums = {random.randrange(lines) for _ in xrange(self.daily_limit)}
                with open(old_queries_filename, mode) as old_queries_file, open(queries) as new_queries_file:
                    for num, line in enumerate(new_queries_file):
                        if num in line_nums:
                            old_queries_file.write(line)
                logging.info('%s new queries', lines)

            self.store_queries(old_queries_filename, lang)
            stored_queries[lang] = old_queries_filename

        if self.ctx.get(LangKUB.name):
            self.store_kub_queries(stored_queries)

        if self.ctx.get(LangLight.name):
            self.store_light_queries(stored_queries)


# Module-level alias of the task class.
# NOTE(review): presumably the name the Sandbox loader looks up -- confirm.
__Task__ = WizardLogs
