import codecs
import collections
import itertools
import json
import logging
import time
import urllib
import urlparse
import os
import sys
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

import sandbox.sandboxsdk.parameters as sdk_parameters
import sandbox.projects.common.bno.juggler as juggler
from sandbox.projects.common import utils
from sandbox.projects.common.bno.mds import Mds
from sandbox.projects.common.bno.params import EnvTypeParam
from sandbox.projects.common.bno.resources import save_resource
from sandbox.projects.common.bno.utils import run_cmd
from sandbox.projects.common.vcs import arc
from sandbox.projects.resource_types import BNO_QUERYDATA_INDEXER_EXECUTABLE, BNO_QUERYDATA_TRIE, BNO_GEMINICL_EXECUTABLE
from sandbox.sandboxsdk import environments
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.svn import Arcadia


def prepare():
    """Export the converter helper modules from Arcadia and make them importable.

    The three files are exported into the current working directory, which is
    then appended to ``sys.path`` so that later function-level imports
    (e.g. ``sideblock_hosts_config``) can find them.
    """
    converters_url = 'arcadia:/arc/trunk/arcadia/quality/functionality/content_plugins/scripts/regular_updates/converters/'
    for module_file in ('sideblock_hosts_config.py', 'sideblock_shims.py', 'base.py'):
        Arcadia.export(converters_url + module_file, './')
    sys.path.append("./")


def flatten(l):
    """Yield the elements of *l*, expanding nested iterables by one level.

    Strings are iterable but are treated as atoms and yielded whole. Only one
    level of nesting is expanded: an iterable found inside a nested iterable
    is yielded as is.
    """
    for entry in l:
        is_nested = isinstance(entry, collections.Iterable) and not isinstance(entry, basestring)
        if not is_nested:
            yield entry
            continue
        for inner in entry:
            yield inner


def to_unicode(value):
    """Return *value* as ``unicode``; anything else is decoded as UTF-8."""
    already_text = isinstance(value, unicode)
    return value if already_text else unicode(value, 'utf-8')


class DeployHostsUrlParam(sdk_parameters.SandboxStringParameter):
    """Required string parameter: base URL used to fetch the list of deploy hosts."""

    name = 'deploy_hosts_url'
    required = True
    description = 'Url to get list of deploy hosts'
    default_value = 'http://querydata.n.yandex-team.ru/get_trie_hosts'


class SaasValidKeysParam(sdk_parameters.SandboxStringParameter):
    """Required string parameter: table (``cluster:path``) holding the valid saas keys."""

    name = 'saas_valid_keys'
    required = True
    description = 'MapReduce table with saas valid keys'
    default_value = 'hahn://home/content_plugins/sideblock_trie/saas_valid_keys'


class MrPathParam(sdk_parameters.ListRepeater, sdk_parameters.SandboxStringParameter):
    """Required repeated string parameter: source tables in ``cluster:path`` form."""

    name = 'mr_path'
    required = True
    description = 'MapReduce tables path'
    default_value = [
        'hahn://home/extdata/aurora/others/general/russianfood_com',
        'hahn://home/extdata/aurora/others/general/povar_ru',
        'hahn://home/extdata/aurora/others/general/eda_ru',
        'hahn://home/extdata/aurora/others/general/edimdoma_ru',
        'hahn://home/extdata/aurora/others/general/povarenok_ru',
        'hahn://home/extdata/aurora/others/general/1000_menu_ru',
    ]


class VaultTokenItemNameParam(sdk_parameters.SandboxStringParameter):
    """Required string parameter: name of the vault item that stores the MR token."""

    name = 'vault_item'
    required = True
    description = 'Vault item name for mr token'
    default_value = 'mrtoken'


class MdsTypeParam(sdk_parameters.SandboxRadioParameter):
    """Radio parameter selecting the MDS environment ('production' or 'beta')."""

    name = 'mds_type'
    description = 'MDS Environment'
    # Each choice is a (label, value) pair where both are the same string.
    choices = [(v, v) for v in ('production', 'beta')]
    default_value = 'beta'


class UploadToSaas(sdk_parameters.SandboxBoolParameter):
    """Boolean switch: deliver the data to saas instead of building/deploying tries."""

    name = 'upload_to_saas'
    default_value = False
    description = 'upload data to saas instead of querysearch'


class SaasNamespace(sdk_parameters.SandboxStringParameter):
    """String parameter: saas namespace template; ``%s`` is filled with the trie kind."""

    name = 'saas_namespace'
    default_value = 'shinyserp_bno_recipe_%s:docid_setprops'
    description = 'saas namespace'


class SaasHost(sdk_parameters.SandboxStringParameter):
    """String parameter: saas host passed to the uploader."""

    name = 'saas_host'
    default_value = 'fastsnips.ferryman.n.yandex-team.ru'
    description = 'saas host'


class SaasRootFolder(sdk_parameters.SandboxStringParameter):
    """String parameter: root folder passed to the saas uploader."""

    name = 'saas_root_folder'
    default_value = '//home/search-functionality/qdsaas/'
    description = 'saas root folder'


class JugglerHostParam(sdk_parameters.SandboxStringParameter):
    """String parameter: juggler host name; an empty value disables monitoring events."""

    name = 'juggler_host'
    default_value = ''
    description = 'If specified, send monitoring events to juggler on this hostname'


class BnoRecipesBuildTask(SandboxTask):
    """Sandbox task that builds recipe snippet data and delivers it.

    The task reads recipe records from the configured tables, re-uploads
    their images to MDS, renders one snippet per "kind" and then either
    builds and rsyncs querydata tries or uploads the data to saas,
    depending on the ``upload_to_saas`` parameter.
    """
    type = 'BNO_RECIPES_BUILD'

    # Resource requirements declared for the task.
    # NOTE(review): units are not visible in this file - confirm against the SDK.
    cores = 1
    required_ram = 1024

    # Parameters shown for the task; their values are read through self.ctx[<Param>.name].
    input_parameters = [
        MrPathParam,
        VaultTokenItemNameParam,
        EnvTypeParam,
        DeployHostsUrlParam,
        MdsTypeParam,
        SaasValidKeysParam,
        UploadToSaas,
        SaasNamespace,
        SaasHost,
        SaasRootFolder,
        JugglerHostParam,
    ]

    # Pip packages requested for the task (yt.wrapper is imported lazily inside methods).
    environment = (
        environments.PipEnvironment('yandex-yt'),
        environments.PipEnvironment("yandex-yt-yson-bindings-skynet"),
        environments.PipEnvironment('requests'),
        environments.PipEnvironment('typing')
    )

    # Prefix of every trie / adapter name built by this task.
    _adapter = 'bno_recipe'
    # Value passed to the indexer's "-N" option in build_trie.
    _keysemantic = 'snipdocid'
    # Snippet template for plain kinds; the None fields are filled in by format_item.
    _content_plugin = {
        'content_plugin': True,
        'disabled': False,
        'SerpInfo': {
            'type': 'construct',
            'format': 'json',
            'template': None,
        },
        'SerpData': {
            'type': None,
            'data': None
        }
    }
    # Snippet template for kinds ending with '_1': disabled and restricted to position 0.
    _content_plugin_1 = {
        'content_plugin': True,
        'disabled': True,
        'allowed_positions': [0],
        'SerpInfo': {
            'type': 'construct',
            'format': 'json',
            'template': None,
        },
        'SerpData': {
            'type': None,
            'data': None
        }
    }
    # Suffix appended to every table path in yt_read_table; the commented value
    # is an example that limits reads for debugging.
    _debug_yt_suffix = ''  # '[:#20]'
    # When False, iter_items skips re-uploading images to MDS.
    _debug_avatar_images = True

    def __init__(self, task_id=0):
        """Initialise the task by delegating to ``SandboxTask.__init__``."""
        SandboxTask.__init__(self, task_id)

    @staticmethod
    def get_data_kind(data):
        """Return the list of trie kinds a decoded record must be emitted for.

        * no ``first_nav_block``: ``['first', 'first_1']``;
        * ``first_nav_block`` present: ``['second', 'second_1']``, followed by
          ``['first', 'first_1']`` when the record also has ``recipes``.
        """
        if 'first_nav_block' not in data:
            return ['first', 'first_1']

        kinds = ['second', 'second_1']
        if 'recipes' in data:
            # This case used to produce the 'third' / 'third_1' kinds.
            kinds.extend(['first', 'first_1'])
        return kinds

    def avatar_image(self, image):
        """Upload one image to MDS and return ``(image, uploaded)``.

        Scheme-relative URLs (``//host/...``) are fetched over https. The
        upload is best effort: on any error the failure is logged and
        ``(image, None)`` is returned, so the caller can drop the item.

        :param image: original image URL
        :return: tuple of the original URL and the result of ``self.mds.upload``
                 (``None`` when the upload failed)
        """
        try:
            src_image = 'https:' + image if image.startswith('//') else image
            return image, self.mds.upload(src_image)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit
            # must still be able to stop the worker.
            logging.warn(image, exc_info=True)
        return image, None

    def map_images(self, data, key, pool):
        """Replace the ``img`` of every item in ``data[key]`` with its uploaded copy.

        Images are uploaded in parallel through *pool*. Items without an
        ``img`` field, and items whose upload did not succeed, are removed
        from ``data[key]``. Nothing happens when *key* is absent.
        """
        if key not in data:
            return

        sources = [entry['img'] for entry in data[key] if 'img' in entry]
        uploaded = pool.map(self.avatar_image, sources)
        avatars = dict((source, avatar) for source, avatar in uploaded if avatar)

        kept = []
        for entry in data[key]:
            if 'img' in entry and entry['img'] in avatars:
                entry['img'] = avatars[entry['img']]
                kept.append(entry)
        data[key] = kept

    def format_item(self, kind, data):
        """Render the snippet line for one record and one trie kind.

        Kinds ending with ``_1`` use the ``_content_plugin_1`` template and
        share the adapter type/template name of the kind without that suffix.

        :param kind: trie kind, e.g. ``'first'`` or ``'first_1'``
        :param data: decoded record; serialised as the snippet payload
        :return: tuple ``(url, kind, snippet)`` where *url* is ``data.get('url')``
                 and *snippet* is ``'Snippet={"<adapter>_<kind>": <json>}'``
        """
        is_variant = kind.endswith('_1')
        template = self._content_plugin_1 if is_variant else self._content_plugin
        striped_kind = kind[:-2] if is_variant else kind
        plugin_type = self._adapter + '_' + striped_kind

        # dict.copy() is shallow, so filling plugin['SerpData'][...] on a copy
        # would write into the nested dicts of the shared class-level template.
        # Build fresh nested dicts instead and leave the templates untouched.
        plugin = dict(template)
        plugin['SerpData'] = dict(template['SerpData'], data=data, type=plugin_type)
        plugin['SerpInfo'] = dict(template['SerpInfo'], template=plugin_type)

        adapter = self._adapter + '_' + kind
        snippet = 'Snippet={"%s": %s}' % (adapter, json.dumps(plugin, ensure_ascii=False))
        return data.get('url'), kind, snippet

    @staticmethod
    def hl_name(data, key):
        """Wrap the ``name`` of every item in ``data[key]`` into a highlight dict.

        The name becomes ``{'__hl': <name>, 'rows': 0.5, 'phonerows': [2, 0.5]}``.
        Items without a ``name`` are left alone; a missing *key* is a no-op.
        """
        if key in data:
            for entry in data[key]:
                if 'name' in entry:
                    entry['name'] = {'__hl': entry['name'], 'rows': 0.5, 'phonerows': [2, 0.5]}

    @staticmethod
    def shorten_result_list(data, key, maxcount):
        """Keep only the first *maxcount* items of ``data[key]`` (no-op if *key* is absent)."""
        if key in data:
            trimmed = data[key][:maxcount]
            data[key] = trimmed

    @staticmethod
    def remove_rating(data, key):
        """Drop the ``rating`` field from every item in ``data[key]`` (no-op if *key* is absent)."""
        if key in data:
            for entry in data[key]:
                entry.pop('rating', None)

    def get_shares(self, kind, attempts=5, sleep=10):
        """Fetch the list of hosts the trie of the given kind is deployed to.

        The list is requested from ``<deploy_hosts_url>/<adapter>_<kind>.trie``
        and the response body is split on whitespace.

        :param kind: trie kind, e.g. ``'first'``
        :param attempts: how many times the request is tried
        :param sleep: pause in seconds between failed attempts
        :return: list of host names
        :raises Exception: the last error once all attempts are exhausted
        """
        error = Exception("Undefined internal error")
        trie_name = self._adapter + "_" + kind + ".trie"
        base_url = self.ctx[DeployHostsUrlParam.name]
        url = base_url + "/" + trie_name
        while attempts > 0:
            try:
                response = urllib.urlopen(url)
                try:
                    # urllib.urlopen() does not raise on HTTP error statuses, so
                    # without this check an error page would be split into "hosts".
                    status = response.getcode()
                    if status is not None and status >= 400:
                        raise IOError("Unexpected HTTP status %s for %s" % (status, url))
                    return response.read().split()
                finally:
                    response.close()
            except Exception as e:
                error = e
                attempts -= 1
                logging.error("Cannot get shares: %s. Attempts left: %d", e, attempts)
                # No point in waiting when there is nothing left to retry.
                if attempts > 0:
                    logging.error("Sleep: %ds", sleep)
                    time.sleep(sleep)
        raise error

    def modify_json(self, data, key):
        """Prepare the list ``data[key]`` for output.

        The list is cut to 7 items, item names are wrapped for highlighting
        and ratings are removed.
        """
        self.shorten_result_list(data, key, 7)
        for transform in (self.hl_name, self.remove_rating):
            transform(data, key)

    def modify_recipes_json(self, data, saas_keys):
        """Filter and prepare ``data['recipes']`` for output.

        Only recipes whose ``url`` is present in *saas_keys* survive, and at
        most the first 7 of them are kept. Names are then wrapped for
        highlighting and ratings are removed.
        """
        if 'recipes' in data:
            with_saas_key = (recipe for recipe in data['recipes'] if recipe['url'] in saas_keys)
            # islice stops pulling from the generator once 7 recipes were taken.
            data['recipes'] = list(itertools.islice(with_saas_key, 7))
        self.hl_name(data, 'recipes')
        self.remove_rating(data, 'recipes')

    def add_sideblock_fields(self, data, saas_keys):
        """Add the sideblock fields to every recipe in ``data['recipes']``.

        Each recipe's ``url`` must be a key of *saas_keys*; the mapped value
        becomes ``sideblock_cgi_url`` and the original URL the fallback.
        Nothing happens when the record has no ``recipes``.
        """
        if 'recipes' in data:
            for recipe in data['recipes']:
                cgi_url = saas_keys[recipe['url']]
                recipe['sideblock_url'] = '/search/cache/touch?'
                recipe['sideblock_cgi_url'] = cgi_url
                recipe['sideblock_fallback'] = recipe['url']
                recipe['can_show_in_sideblock'] = True

    def iter_items(self, item, decoder, pool, saas_keys):
        """Decode one raw record and yield a formatted snippet per kind.

        Processing is best effort: a record that fails anywhere (bad JSON,
        unexpected structure, ...) is logged and produces nothing more.
        Records that end up without any recipe are skipped silently.

        :param item: raw JSON text of the record
        :param decoder: ``json.JSONDecoder`` used to decode *item*
        :param pool: thread pool used for parallel image uploads
        :param saas_keys: mapping of recipe URL to saas key
        :return: generator of the tuples produced by ``format_item``
        """
        # noinspection PyBroadException
        try:
            data = decoder.decode(item)
            kinds = self.get_data_kind(data)
            if not kinds:
                return

            if self._debug_avatar_images:
                self.map_images(data, 'recipes', pool)
                self.map_images(data, 'first_nav_block', pool)
                self.map_images(data, 'second_nav_block', pool)

            self.modify_recipes_json(data, saas_keys)

            # .get(): a record without 'recipes' is an ordinary skip, not an
            # error worth a KeyError traceback in the log.
            if not data.get('recipes'):
                return

            self.modify_json(data, 'first_nav_block')
            self.modify_json(data, 'second_nav_block')
            self.add_sideblock_fields(data, saas_keys)

            for kind in kinds:
                yield self.format_item(kind, data)

        except Exception:
            # A bare "except:" here would also catch GeneratorExit (raised at
            # the yield when the generator is closed) and KeyboardInterrupt.
            logging.warn(item, exc_info=True)

    def yt_read_table(self, token, table, columns=None):
        """Read a table given as ``<proxy>:<path>`` and return its rows.

        :param token: token passed to ``YtClient``
        :param table: table reference, e.g. ``'hahn://home/some/table'``
        :param columns: optional list of columns to read
        :return: the result of ``YtClient.read_table`` (yson format, parsed rows)
        :raises ValueError: if *table* has no ``':'`` separator
        """
        from yt.wrapper import YtClient, TablePath
        # Split on the first ':' only, so that a path which itself contains
        # ':' (for example a range selector) is not silently truncated.
        proxy, separator, table_path = table.partition(':')
        if not separator:
            raise ValueError("Expected table in '<proxy>:<path>' format, got %r" % (table,))
        client = YtClient(proxy, token)
        path = TablePath(table_path + self._debug_yt_suffix, columns=columns)
        return client.read_table(path, format='yson', raw=False)

    def read_saas_keys(self, token):
        """Load the mapping of recipe URL to saas key from the valid-keys table.

        URLs rejected by ``allow_sideblock_in_snippet_gallery`` are skipped.
        For eda.ru hosts the key is also registered under the URL with the
        ``?from=recipescatalog`` suffix.

        :param token: token used to read the table
        :return: dict ``{url: saas_key}``
        """
        from sideblock_hosts_config import allow_sideblock_in_snippet_gallery

        eda_hosts = ('eda.ru', 'www.eda.ru')
        result = {}
        rows = self.yt_read_table(token, self.ctx[SaasValidKeysParam.name], columns=["url", "meta"])
        for row in rows:
            url = row["url"]
            if allow_sideblock_in_snippet_gallery(url):
                saas_key = row["meta"]["saas_key"]
                result[url] = saas_key
                if urlparse.urlsplit(url).hostname in eda_hosts:
                    result[url + '?from=recipescatalog'] = saas_key
        return result

    def extract_data(self, token, tables, pool):
        """Convert all source tables into snippet records and save them as a resource.

        Rows are processed lazily: the ``value`` column of each row is turned
        into unicode, passed through ``iter_items`` and the resulting records
        are streamed to ``save_resource`` under ``<adapter>.data``.

        :return: whatever ``save_resource`` returns for the saved data
        """
        saas_keys = self.read_saas_keys(token)
        decoder = json.JSONDecoder(strict=False)

        def records():
            # Tables are read one after another, only when the output is consumed.
            for table in tables:
                for row in self.yt_read_table(token, table):
                    raw_item = to_unicode(row['value'])
                    for record in self.iter_items(raw_item, decoder, pool, saas_keys):
                        yield record

        url_path = self.path(self._adapter + '.data')
        return save_resource(self, data=records(), path=url_path)

    @staticmethod
    def prepare_url_data(kind, path):
        """Yield the first column of every line of *path* whose second column is *kind*.

        The file is read as UTF-8 with tab-separated columns.
        """
        with codecs.open(path, 'r', 'utf-8') as source:
            for row in source:
                columns = row.split('\t')
                if columns[1] == kind:
                    yield columns[0]

    def canonize_urls(self, kind, src_path):
        """Resolve the URLs listed in *src_path* to keys with the gemini client tool.

        The tool is run three times: once to find mobile counterparts of the
        source URLs and twice (source URLs, then mobile URLs) to obtain the
        ``search_doc_id`` responses. Intermediate files are saved as resources.

        :param kind: trie kind; used only to name the produced files
        :param src_path: path of the file with the source URLs
        :return: tuple ``(dict_urls, resource)`` where *dict_urls* maps a source
                 (desktop) URL to the list of keys collected for it and for its
                 mobile counterpart, and *resource* is the result of saving the
                 parsed responses under ``<adapter>_<kind>.gemini``
        """
        tool = utils.sync_last_stable_resource(BNO_GEMINICL_EXECUTABLE, arch='linux')
        path = self.path(self._adapter + "_" + kind + ".gemini.json")

        # Find mobile versions of the source URLs and remember how to map
        # each of them back to the desktop URL it came from.
        mobile_to_desktop_url = {}
        mobile_urls_json_path = self.path(self._adapter + "_" + kind + ".mobile_urls.json")
        mobile_urls_path = self.path(self._adapter + "_" + kind + ".mobile_urls")
        # NOTE(review): ">" is passed as an argument - presumably run_cmd runs
        # the command through a shell so that it acts as a redirect; confirm.
        run_cmd([tool, "--format", "json", "--type", "desktop2mobile", "-f", src_path, ">", mobile_urls_json_path])
        save_resource(self, path=mobile_urls_json_path)
        with codecs.open(mobile_urls_json_path, 'r', encoding="utf-8") as f:
            # One JSON document per line; keep only error-free responses whose
            # main (mobile) URL differs from the original one.
            data = (json.loads(item) for item in f)
            data = (item['Response'] for item in data if 'Response' in item and 'Error' not in item['Response'])
            data = ((item['OriginalUrl'], item['MainUrl'][0]) for item in data if item['OriginalUrl'] != item['MainUrl'][0])
            # Materialise before the file is closed: the pairs are used twice.
            urls = list(data)
            for (desktop_url, mobile_url) in urls:
                mobile_to_desktop_url[mobile_url] = desktop_url
            urls = (mobile_url for (desktop_url, mobile_url) in urls)
            save_resource(self, data=urls, path=mobile_urls_path)

        dst_path = self.path(self._adapter + "_" + kind + '.gemini')
        # Responses for the mobile URLs are appended (">>") to the same file.
        run_cmd([tool, "--format", "json", "--type", "search_doc_id", "-f", src_path, '>', path])
        run_cmd([tool, "--format", "json", "--type", "search_doc_id", "-f", mobile_urls_path, '>>', path])

        save_resource(self, path=path)
        with codecs.open(path, 'r', encoding="utf-8") as f:
            data = (json.loads(item) for item in f)
            data = (item['Response'] for item in data if 'Response' in item and 'Error' not in item['Response'])
            data = ((item['OriginalUrl'], item['CanonizedUrl'], item['MainUrl'][0]) for item in data)
            urls = list(data)
            dict_urls = {}
            # The canonized URL is the key when uploading to saas, the main URL otherwise.
            use_canonized = self.ctx[UploadToSaas.name]
            for original_url, canonized_url, main_url in urls:
                url = original_url
                # Keys of a mobile URL are stored under its desktop URL.
                if url in mobile_to_desktop_url:
                    url = mobile_to_desktop_url[url]
                if url not in dict_urls:
                    dict_urls[url] = []
                key = canonized_url if use_canonized else main_url
                dict_urls[url].append(key)
            return dict_urls, save_resource(self, data=urls, path=dst_path)

    def build_keys(self, kind, path):
        """Save the URLs of the given kind from *path* and resolve them to keys.

        :return: the ``(url -> keys dict, resource)`` tuple of ``canonize_urls``
        """
        url_file = self.path(self._adapter + "_" + kind + ".url")
        saved_urls = save_resource(self, data=self.prepare_url_data(kind, path), path=url_file)
        return self.canonize_urls(kind, saved_urls.path)

    @staticmethod
    def prepare_trie_data(url2key, kind, path):
        """Yield ``(key, snippet)`` pairs for every line of *path* of the given kind.

        Lines are tab-separated ``url, kind, snippet``. A line is emitted once
        per key that *url2key* holds for its URL; URLs without keys are skipped.
        """
        with codecs.open(path, "r", "utf-8") as source:
            for row in source:
                columns = row.split('\t')
                if columns[1] == kind:
                    for key in url2key.get(columns[0]) or ():
                        yield key, columns[2]

    def save_trie_data(self, url2key, kind, path):
        """Save the ``(key, snippet)`` pairs of the given kind as ``<adapter>_<kind>.trie.data``.

        :return: whatever ``save_resource`` returns for the saved file
        """
        target = self.path(self._adapter + "_" + kind + ".trie.data")
        pairs = self.prepare_trie_data(url2key, kind, path)
        return save_resource(self, data=pairs, path=target)

    def build_trie(self, kind, data):
        """Build the querydata trie of the given kind and pack it into a tar resource.

        The indexer writes into the ``deploy.<kind>`` directory, whose contents
        are then archived as ``<adapter>_<kind>.trie.tar``.

        :param kind: trie kind
        :param data: path of the trie data file passed to the indexer
        :return: the saved ``BNO_QUERYDATA_TRIE`` resource
        """
        indexer = utils.sync_last_stable_resource(BNO_QUERYDATA_INDEXER_EXECUTABLE, arch='linux')
        deploy_dir = self.path('deploy.' + kind)
        if not os.path.exists(deploy_dir):
            os.makedirs(deploy_dir)

        trie_name = self._adapter + '_' + kind
        trie_file = trie_name + '.trie'
        indexer_cmd = [
            indexer, "--enable-keyref", "-E", "-w",
            "-i", data,
            "-D", os.path.join(deploy_dir, trie_file),
            "-N", self._keysemantic,
            "-S", "docid_setprops/" + trie_name,
            "-n", "2",
            "-f", "4096,128000",
        ]
        run_cmd(indexer_cmd)

        archive = trie_file + '.tar'
        run_cmd(['tar', '-C', deploy_dir, '-cf', archive, '.'])
        return save_resource(self, path=archive, resource_type=BNO_QUERYDATA_TRIE)

    def is_production(self):
        """Tell whether the environment type parameter is set to 'production'."""
        env_type = self.ctx[EnvTypeParam.name]
        return env_type == 'production'

    def is_mds_production(self):
        """Tell whether the MDS environment parameter is set to 'production'."""
        mds_type = self.ctx[MdsTypeParam.name]
        return mds_type == 'production'

    def deploy_trie(self, host, kind, path):
        """Unpack the trie archive and rsync its parts to *host*.

        The target rsync module is ``querydata`` in production and
        ``querydata-beta`` otherwise.

        :param host: target host
        :param kind: trie kind (selects the ``.trie.tag`` file)
        :param path: path of the tar archive produced by ``build_trie``
        :return: always ``True``; failures surface as errors from ``run_cmd``
        """
        module_suffix = '' if self.is_production() else '-beta'
        rsync_path = "{}::querydata{}/".format(host, module_suffix)
        run_cmd(['tar', '-xf', path])
        tag_file = self._adapter + '_' + kind + '.trie.tag'
        for entry in ('000', '001', tag_file):
            run_cmd(['rsync', '-vr', entry, rsync_path])
        return True

    def upload_to_saas(self, kind, trie_resource):
        """Upload the trie data of the given kind to saas and trigger delivery.

        The uploader library is imported from an arc mount, which is created
        on first use and kept in ``self.arcadia_path``.

        :param kind: trie kind; substituted into the namespace template
        :param trie_resource: path of the tab-separated ``key, snippet`` file
        :raises Exception: if the namespace parameter is not of the form
                           ``name:docid_setprops`` or ``name:categflag``
        """
        if getattr(self, 'arcadia_path', None) is None:
            self.arcadia_path = arc.Arc().mount_path(None, None, fetch_all=False)
        uploader_root = os.path.join(
            self.arcadia_path._work_path, "quality", "functionality", "unstructured" + os.path.sep)
        sys.path.append(uploader_root)
        from qdsaas_uploader.lib.uploader import Uploader
        from yt.wrapper import YtClient

        ctx = self.ctx
        saas_namespace = ctx[SaasNamespace.name]
        saas_host = ctx[SaasHost.name]
        saas_root_folder = ctx[SaasRootFolder.name]
        saas_target = ctx[EnvTypeParam.name]
        yt_token = self.get_vault_data(ctx[VaultTokenItemNameParam.name])

        name_and_type = saas_namespace.split(':', 1)
        is_valid = len(name_and_type) == 2 and name_and_type[-1] in ('docid_setprops', 'categflag')
        if not is_valid:
            raise Exception('Expected namespace format: some_name:docid_setprops or some_name:categflag')

        uploader = Uploader(
            ns_name=saas_namespace % kind,
            saas_host=saas_host,
            root_folder=saas_root_folder,
            delivery_subfolder='deploy_' + saas_target,
            yt_client=YtClient(proxy='arnold', token=yt_token)
        )
        with open(trie_resource) as trie_file:
            rows = []
            for raw_line in trie_file:
                record = raw_line.rstrip('\n\r')
                if record:
                    key, snippet = record.split('\t', 1)
                    # Strip the "Snippet=" / "Snippet:b=" prefix, leaving the JSON payload.
                    if snippet.startswith(('Snippet=', 'Snippet:b=')):
                        snippet = snippet.split('=', 1)[1]
                    rows.append((key, json.loads(snippet)))
            uploader.upload_table(rows)
        uploader.deliver_table()

    def on_execute_kind(self, kind, path):
        """Produce the deliverable for one trie kind.

        :param kind: trie kind
        :param path: path of the extracted snippet data
        :return: ``[kind, path]`` of the trie data (saas mode) or of the built
                 trie archive; ``None`` when no keys were resolved for the kind
        """
        url2key, _ = self.build_keys(kind, path)
        if url2key:
            trie_data = self.save_trie_data(url2key, kind, path)
            if self.ctx[UploadToSaas.name]:
                deliverable = trie_data
            else:
                deliverable = self.build_trie(kind, trie_data.path)
            return [kind, deliverable.path]
        return None

    def on_execute(self):
        """Run the task: extract the data, build the per-kind deliverables, deliver them.

        Start and finish events are sent to juggler (when a juggler host is
        configured). Kinds are prepared in parallel on a thread pool and are
        then either uploaded to saas or rsynced as tries to the deploy hosts;
        deployment of a kind stops after two successful hosts.
        """
        prepare()
        juggler_host = self.ctx[JugglerHostParam.name]
        juggler_client = juggler.JugglerClient(enabled=bool(juggler_host), host=juggler_host)
        juggler_client.ping('bno_recipes_build_start', juggler.STATUS_OK, 'A new task has been started')
        self.mds = Mds('snippets_images', 'pad_4x3_1_resize', not self.is_mds_production())
        vault_item = self.ctx[VaultTokenItemNameParam.name]
        yt_token = self.get_vault_data(vault_item)
        yt_tables = list(self.ctx[MrPathParam.name])
        pool = ThreadPool(cpu_count())
        try:
            with self.current_action('Extracting data'):
                resource = self.extract_data(yt_token, yt_tables, pool)

            with self.current_action('Prepare tries'):
                path = resource.path
                # The 'third' / 'third_1' kinds were disabled here and are not built.
                kinds = ('first', 'second', 'first_1', 'second_1')
                jobs = [pool.apply_async(self.on_execute_kind, (kind, path)) for kind in kinds]
                results = [job.get() for job in jobs]

                for kind, result in zip(kinds, results):
                    if not result:
                        # on_execute_kind returns None when no keys were resolved
                        # for the kind; indexing it would crash the whole task.
                        logging.warning("No keys were built for kind %s, nothing to deliver", kind)
                        continue
                    deliverable_path = result[1]
                    if self.ctx[UploadToSaas.name]:
                        with self.current_action('Uploading data ' + kind + ' to SAAS'):
                            self.upload_to_saas(kind, deliverable_path)
                    else:
                        success_count = 0
                        for host in self.get_shares(kind):
                            with self.current_action('Deploying data ' + kind + ' to ' + host):
                                if self.deploy_trie(host, kind, deliverable_path):
                                    success_count += 1
                            if success_count >= 2:
                                break
            juggler_client.ping('bno_recipes_build_finish', juggler.STATUS_OK, 'The task has done its job')
        finally:
            # Release the arc mount created by upload_to_saas even if the task failed.
            arcadia_path = getattr(self, 'arcadia_path', None)
            if arcadia_path is not None and arcadia_path.mounted:
                arcadia_path.unmount()
                self.arcadia_path = None


# Module-level alias of the task class.
# NOTE(review): presumably the name the Sandbox loader looks up to discover the task - confirm.
__Task__ = BnoRecipesBuildTask
