# -*- coding: utf-8 -*-

import codecs
import logging
import os
import re
from collections import defaultdict

from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk import environments
from sandbox.sandboxsdk.svn import Arcadia

from sandbox.projects import resource_types
from sandbox.projects.common.mediasearch.trie import MediaReleaseTrie

from sandbox.projects.common.juggler import jclient


class AppendVideotopFastDeliveryUrlsParameter(parameters.SandboxBoolParameter):
    """Switch for including urls from the videotop fast delivery tables.

    When set, VideoReleaseFastdeliveryTrie reads every table under
    VideotopYTPathParameter. Enabled by default.
    """

    group = "Included Data"
    name = "use_videotop_fast_delivery_table"
    description = "Add urls from videotop fast delivery table"
    default_value = True


class AppendVideotopForcedFastDeliveryUrlsParameter(parameters.SandboxBoolParameter):
    """Switch for including urls from the forced videotop tables.

    When set, VideoReleaseFastdeliveryTrie reads every table under
    VideotopForcedYTPathParameter. Enabled by default.
    """

    group = "Included Data"
    name = "use_videotop_forced_fast_delivery_table"
    description = "Add urls from forced top delivery table"
    default_value = True


class AppendSerialsAndFilmsUrlsParameter(parameters.SandboxBoolParameter):
    """Switch for including the serials / vnorm url lists exported from svn.

    When set, VideoReleaseFastdeliveryTrie exports `serials.lst` and
    `vnorm.lst` from Arcadia and merges their urls. Enabled by default.
    """

    group = "Included Data"
    name = "use_serials_table"
    description = "Add urls from serials tables"
    default_value = True


class VideotopYTServerParameter(parameters.SandboxStringParameter):
    """YT proxy used by VideoReleaseFastdeliveryTrie to read the videotop tables.

    NOTE(review): `name` and `description` mention "static factors" although
    the value is used as the proxy for the fast delivery tables; this looks
    like a leftover. Renaming would change the ctx key, so confirm first.
    """

    group = "Fastdelivery"
    name = "static_factors_yt_server"
    description = "Static Factors YT server"
    default_value = "banach"
    required = True


class VideotopYTPathParameter(parameters.SandboxStringParameter):
    """YT directory whose child tables hold the fast delivery urls.

    VideoReleaseFastdeliveryTrie lists this path and reads the `url` column
    of every table found there.
    """

    group = "Fastdelivery"
    name = "yt_path_to_urls"
    description = "YT path to queries and urls"
    default_value = "//home/videoindex/videotop/fastdelivery"
    required = True


class VideotopForcedYTPathParameter(parameters.SandboxStringParameter):
    """YT directory whose child tables hold the forced fast delivery urls.

    VideoReleaseFastdeliveryTrie lists this path and reads the `url` column
    of every table found there.
    """

    group = "Fastdelivery"
    name = "yt_path_to_forced_urls"
    description = "YT path to forced urls table"
    default_value = "//home/videoindex/videotop/forced"
    required = True


class FilmsYTServerParameter(parameters.SandboxStringParameter):
    """YT proxy that VideoReleaseFastdeliveryTrie._extract_films reads from."""

    group = "Films"
    name = "films_yt_server"
    description = "Films YT server"
    default_value = "hahn"
    required = True


class FilmsYTPathParameter(parameters.SandboxStringParameter):
    """YT path of the films table.

    Currently referenced only from a commented-out call in
    VideoReleaseFastdeliveryTrie._generate_trie_data.
    """

    group = "Films"
    name = "films_yt_path"
    description = "YT path to films table"
    default_value = "//home/dict/ontodb/video/top_ru_5k_from_toloka"
    required = True


class NewFilmsYTPathParameter(parameters.SandboxStringParameter):
    """YT path of the new (fresh) films table.

    Currently referenced only from a commented-out call in
    VideoReleaseFastdeliveryTrie._generate_trie_data.
    """

    group = "Films"
    name = "new_films_yt_path"
    description = "YT path to new films table"
    default_value = "//home/dict/ontodb/video/fresh_films_from_toloka/ru"
    required = True


class SeriesYTPathParameter(parameters.SandboxStringParameter):
    """YT path of the series table.

    Currently referenced only from a commented-out call in
    VideoReleaseFastdeliveryTrie._generate_trie_data.
    """

    group = "Series"
    name = "series_yt_path"
    description = "YT path to series table"
    default_value = "//home/videoquality/vuserdata2/external_data/external_factors/toloka_serial_queryurl_ok"
    required = True


def cut_scheme(url):
    """Return `url` without a leading "//", "http://" or "https://".

    Matching is case-sensitive and at most one prefix is removed; a url that
    starts with none of them is returned unchanged.
    """
    for prefix in ("//", "http://", "https://"):
        if url.startswith(prefix):
            return url[len(prefix):]
    return url


class VideoReleaseFastdeliveryTrie(MediaReleaseTrie):
    """
        Builds videotop.trie for Yandex.Video service
    """

    type = "VIDEO_RELEASE_FASTDELIVERY_TRIE"

    input_parameters = (
        AppendVideotopFastDeliveryUrlsParameter,
        AppendVideotopForcedFastDeliveryUrlsParameter,
        AppendSerialsAndFilmsUrlsParameter,

        VideotopYTServerParameter,
        VideotopYTPathParameter,
        VideotopForcedYTPathParameter,

        FilmsYTServerParameter,
        FilmsYTPathParameter,
        NewFilmsYTPathParameter,
        SeriesYTPathParameter,
    ) + MediaReleaseTrie.input_parameters

    release_subject = "video/middle/fastdelivery-data-{timestamp}"
    release_comment = "video fast fastdelivery.trie"
    release_resources = (
        resource_types.VIDEO_MIDDLESEARCH_FASTDELIVERY_TRIE,
    )

    environment = (
        environments.PipEnvironment("yandex-yt"),
        environments.PipEnvironment("yandex-yt-yson-bindings-skynet", version="0.3.32-0")
    )

    # Short method code (as found in table names and svn lists) -> key of the
    # url list inside an emitted trie record.
    _TRIE_KEY_BY_METHOD = {
        'u': 'urls',
        'mbu': 'mbu_urls',
        'relev': 'relev',
    }

    def _generate_trie_data(self):
        """
            Collect urls from every enabled source and yield trie records.

            Sources (each guarded by its own boolean parameter):
              * tables under VideotopYTPathParameter, stored with method 'mbu';
              * tables under VideotopForcedYTPathParameter, method taken from
                the table name;
              * serials.lst and vnorm.lst exported from Arcadia.

            Yields (tld, query, trie_data) tuples, where trie_data maps
            'urls' / 'mbu_urls' / 'relev' to a list of scheme-less urls.
            Entries with an unknown method are logged and left out.
        """
        import yt.wrapper as yt

        yt.config['proxy']['url'] = self.ctx[VideotopYTServerParameter.name]
        yt.config['token'] = self.get_vault_data('VIDEODEV', 'yt_token')

        # tld -> query key -> method -> set of urls
        top_data = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

        if self.ctx[AppendVideotopFastDeliveryUrlsParameter.name]:
            top_path = self.ctx[VideotopYTPathParameter.name]
            for qt in yt.list(top_path):
                # The table name carries "<query><sep><tld>", sep being ';' or '_'.
                query, tld = re.split('[;_]', qt.decode('utf-8'))[:2]
                # Unicode template: with a byte-string template Python 2 would
                # encode the unicode `query` as ASCII and fail on non-ASCII
                # names. Keys built from the svn lists are unicode as well.
                query_key = u"#top;{}".format(query)
                for line in yt.read_table(top_path + '/' + qt):
                    top_data[tld][query_key]['mbu'].add(cut_scheme(line['url']))

        if self.ctx[AppendVideotopForcedFastDeliveryUrlsParameter.name]:
            forced_path = self.ctx[VideotopForcedYTPathParameter.name]
            for qt in yt.list(forced_path):
                # The table name carries "<query><sep><tld><sep><method>".
                query, tld, method = re.split('[;_]', qt.decode('utf-8'))[:3]
                query_key = u"#top_forced;{}".format(query)  # unicode, see above
                for line in yt.read_table(forced_path + '/' + qt):
                    if 'url' not in line:
                        # NOTE(review): the message mentions a `subkey' column,
                        # but only `url' is accepted here.
                        raise RuntimeError("Unexpected format of forced urls table, it has to have either `url' or `subkey' column containing url")
                    top_data[tld][query_key][method].add(cut_scheme(line['url']))

        if self.ctx[AppendSerialsAndFilmsUrlsParameter.name]:
            self._extract_svn_data('arcadia:/arc/trunk/data/extsearch/video/quality/top/serials.lst', top_data)
            self._extract_svn_data('arcadia:/arc/trunk/data/extsearch/video/quality/top/vnorm.lst', top_data)

            # The films / series sources below are disabled; the reason is not
            # recorded in this file.
#            self._extract_films(top_data, self.ctx[FilmsYTPathParameter.name])
#            self._extract_films(top_data, self.ctx[NewFilmsYTPathParameter.name])

#            self._extract_series(top_data, self.ctx[SeriesYTPathParameter.name])

        for tld, queries in top_data.iteritems():
            for query, data in queries.iteritems():
                trie_data = dict()
                for method, urls in data.iteritems():
                    key = self._TRIE_KEY_BY_METHOD.get(method)
                    if key is None:
                        logging.error('bad method: %s', method)
                        continue
                    trie_data[key] = list(urls)
                # A record is emitted even when every method was rejected
                # (trie_data is then empty).
                yield (tld, query, trie_data)

    def _extract_svn_data(self, svn_url, data):
        """
            Export a tab-separated list from svn and merge it into `data`.

            Every line must hold exactly five tab-separated fields:
            query, method, exp, tld, url. Any other line makes the unpacking
            raise ValueError. Urls are stored scheme-less under
            data[tld]["<query>;<exp>"][method].
        """
        exported_path = self.abs_path(os.path.basename(svn_url))
        Arcadia.export(svn_url, exported_path)
        with codecs.open(exported_path, 'r', encoding='utf-8') as in_file:
            for line in in_file:
                query, method, exp, tld, url = line.strip().split('\t')
                data[tld][query + ';' + exp][method].add(cut_scheme(url))

    def _extract_films(self, top_data, yt_films_path):
        """
            Merge film urls from the YT table `yt_films_path` into top_data.

            Does nothing when `yt_films_path` is empty. Otherwise switches the
            global yt config to the proxy from FilmsYTServerParameter and keeps
            rows whose `result` is 'ok' or 'tolerable' and whose `probability`
            is at least 0.8. Per lower-cased `searchquery`, up to 5 'ok' urls
            go to method 'u' and up to 5 'tolerable' urls to method 'mbu',
            under top_data['ru']["<query>;prod"].
        """
        import yt.wrapper as yt

        if not yt_films_path:
            return

        yt.config['proxy']['url'] = self.ctx[FilmsYTServerParameter.name]
        yt.config['token'] = self.get_vault_data('VIDEODEV', 'yt_token')

        ok_data = defaultdict(set)
        tolerable_data = defaultdict(set)

        for line in yt.read_table(yt_films_path):
            result = line['result']
            if result not in ('ok', 'tolerable'):
                continue

            prob = line['probability']
            if float(prob) < 0.8:
                continue

            query = line['searchquery'].decode('utf-8').lower()
            url = line['url']

            query_data = ok_data if result == 'ok' else tolerable_data
            query_data[query].add(cut_scheme(url))

        trie_query = 'prod'
        # The urls were already passed through cut_scheme when collected; the
        # second pass below is kept so the output stays as it was.
        for query, urls in ok_data.iteritems():
            for url in list(urls)[:5]:
                top_data['ru'][query + ';' + trie_query]['u'].add(cut_scheme(url))

        for query, urls in tolerable_data.iteritems():
            for url in list(urls)[:5]:
                top_data['ru'][query + ';' + trie_query]['mbu'].add(cut_scheme(url))

    def _extract_series(self, top_data, yt_series_path):
        """
            Merge series episode urls from `yt_series_path` into top_data.

            Does nothing when `yt_series_path` is empty. Keeps rows whose
            `result` is 'REL' and whose `probability` is at least 0.8. The
            query is "<lower-cased title>.<season>.<episode>"; up to 15 urls
            per query go to method 'u' under top_data['ru']["<query>;prod"].

            NOTE(review): unlike _extract_films, this method does not set the
            yt proxy / token and reads with whatever the global yt config
            holds at call time. Confirm which cluster is intended.
        """
        import yt.wrapper as yt

        if not yt_series_path:
            return

        query_data = defaultdict(set)

        for line in yt.read_table(yt_series_path):
            result = line['result']
            if result not in ('REL',):
                continue

            prob = line['probability']
            if float(prob) < 0.8:
                continue

            query = line['title'].decode('utf-8').lower() + "." + str(line['season']) + '.' + str(line['episode'])
            url = line['url']

            query_data[query].add(cut_scheme(url))

        trie_query = 'prod'
        # Second cut_scheme pass kept so the output stays as it was.
        for query, urls in query_data.iteritems():
            for url in list(urls)[:15]:
                top_data['ru'][query + ';' + trie_query]['u'].add(cut_scheme(url))

    def get_nanny_oauth_token(self):
        """
            Return the 'robot-video-crawl-nanny-oauth' vault entry requested
            for 'VIDEO-ROBOT'.
        """
        return self.get_vault_data('VIDEO-ROBOT', 'robot-video-crawl-nanny-oauth')

    def _monitor_ban(self):
        """
            Send an 'OK' event to Juggler through jclient, passing
            'video_quality.sandbox' and this task's type (presumably host and
            service -- confirm against jclient).
        """
        # The message text, including its spelling, is left as is.
        jclient.send_events_to_juggler('video_quality.sandbox', self.type, 'OK', 'bans released sucessfully')


# Module-level alias for the task class defined above; presumably the name the
# Sandbox task loader looks up -- confirm against the SDK.
__Task__ = VideoReleaseFastdeliveryTrie
