import logging
import codecs
import json
import urllib2
import xml.etree.ElementTree
from shutil import copyfile
import urlparse

import sandbox.sandboxsdk.parameters as sdk_parameters
import sandbox.common.types.misc as ctm
import sandbox.common.types.client as ctc
from sandbox.projects.common import utils
from sandbox.projects.common.bno.params import EnvTypeParam
from sandbox.projects.common.bno.resources import save_resource
from sandbox.projects.common.bno.utils import run_cmd
from sandbox.projects.resource_types import BNO_GEMINICL_EXECUTABLE, BNO_NEWS_HOSTS, PLAIN_TEXT
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk import environments


class MrPathParam(sdk_parameters.SandboxStringParameter):
    """Required string parameter: MapReduce tables path.

    The task splits the value on ':' and uses the two halves as the
    YT client address and the table path respectively.
    """
    required = True
    name = 'mr_path'
    default_value = 'hahn://home/freshness/bno/host_regions'
    description = 'MapReduce tables path'


class VaultTokenItemNameParam(sdk_parameters.SandboxStringParameter):
    """Required string parameter: name of the vault item holding the MR token."""
    required = True
    name = 'vault_item'
    default_value = 'mrtoken'
    description = 'Vault item name for mr token'


class NewsSourcesParam(sdk_parameters.SandboxUrlParameter):
    """URL parameter: location of the news sources (partners) XML export."""
    default_value = 'https://news.yandex.ru/export/partners'
    name = 'news_sources'
    description = 'News sources export url'


class BnoNewsHostsBuildTask(SandboxTask):
    """Sandbox task that builds the BNO news hosts resource.

    ``on_execute`` performs the following steps:

    1. reads the host -> regions table and collects the "federal" hosts;
    2. downloads the news sources export and extracts the site urls;
    3. asks the gemini tool for the mobile counterparts of these urls;
    4. appends the mobile urls to the downloaded ones;
    5. asks the gemini tool to canonize the joined list;
    6. maps mobile canonical urls back, flags federal hosts and saves the
       result as a ``BNO_NEWS_HOSTS`` resource.
    """
    type = 'BNO_NEWS_HOSTS_BUILD'
    dns = ctm.DnsType.DNS64
    client_tags = ctc.Tag.Group.LINUX
    input_parameters = [EnvTypeParam, NewsSourcesParam, MrPathParam, VaultTokenItemNameParam]
    environment = (
        environments.PipEnvironment("yandex-yt"),
    )

    def __init__(self, task_id=0):
        SandboxTask.__init__(self, task_id)

    def on_execute(self):
        """Run the whole pipeline; see the class docstring for the steps."""
        vault_item = self.ctx[VaultTokenItemNameParam.name]
        yt_token = self.get_vault_data(vault_item)
        yt_table = self.ctx[MrPathParam.name]
        data = list(self.yt_read_table(yt_token, yt_table))
        federal_hosts = set(
            unicode(item['key'], 'utf-8')
            for item in data
            if self.is_federal(unicode(item['value'], 'utf-8'))
        )
        logging.info("Federal hosts %s out of %s", len(federal_hosts), len(data))
        with self.current_action('Download news urls'):
            # Only the saved resource (its file name) is needed further on.
            _, resource = self.get_source_hosts()
        with self.current_action('Mobilize news urls'):
            mobs, _ = self.mobilize_urls(resource.file_name)
        with self.current_action('Join urls'):
            joined = self.append_urls(resource.file_name, mobs)
        with self.current_action('Cannonize urls'):
            canonized, _ = self.canonize_urls(joined.file_name)
        with self.current_action('Fix moble urls'):
            self.fix_urls(mobs, canonized, federal_hosts)

    @staticmethod
    def is_federal(regions_string):
        """Tell whether a ';'-separated list of region ids marks a federal host.

        A host is federal when both region 213 and region 2 are listed.
        NOTE(review): presumably geobase ids of the two capital cities -
        confirm against the table producer.

        Empty fragments (empty string, doubled or trailing ';') are ignored
        instead of raising ``ValueError``.
        """
        regions = set(int(part) for part in regions_string.split(';') if part.strip())
        return 213 in regions and 2 in regions

    @staticmethod
    def yt_read_table(token, table):
        """Read a table given as ``'<proxy>:<path>'`` in ``dsv`` format.

        :param token: access token passed to ``YtClient``.
        :param table: e.g. ``'hahn://home/freshness/bno/host_regions'``.
        :return: whatever ``YtClient.read_table`` returns (an iterable of rows).
        :raises ValueError: if ``table`` lacks the ':' separator or either half.
        """
        from yt.wrapper import YtClient
        # Split on the first ':' only, so a ':' inside the path is preserved.
        proxy, separator, path = table.partition(':')
        if not (separator and proxy and path):
            raise ValueError("Table must look like '<proxy>:<path>', got {!r}".format(table))
        client = YtClient(proxy, token)
        return client.read_table(path, format='dsv', raw=False)

    @staticmethod
    def fix_url(item, mobs_dict, federal_hosts):
        """Build one output row for a canonized url triple.

        :param item: ``(original, canonized, main)`` triple.
        :param mobs_dict: mapping used to replace ``item[1]`` when it has a
            (truthy) entry there.
        :param federal_hosts: collection of hosts treated as federal.
        :return: ``(original, canonical, main, flag)`` where ``flag`` is
            ``'1'`` if the canonical url's netloc is a federal host, else ``'0'``.
        """
        canonical = mobs_dict.get(item[1]) or item[1]
        is_federal_host = urlparse.urlparse(canonical).netloc in federal_hosts
        return item[0], canonical, item[2], '1' if is_federal_host else '0'

    def fix_urls(self, mobs, canonized, federal_hosts):
        """Fix canonized triples via ``fix_url`` and save them as BNO_NEWS_HOSTS.

        The rows are materialised into a list: a generator would always be
        truthy (defeating the emptiness guard) and would be exhausted by
        ``save_resource`` before the caller could use it.

        :return: ``(rows, resource)``; ``resource`` is ``None`` when there
            are no rows (nothing is saved in that case).
        """
        mobs_dict = {item[2]: item[1] for item in mobs}
        data = [self.fix_url(item, mobs_dict, federal_hosts) for item in canonized]
        resource = save_resource(self, data, self.path('news.hosts.txt'), BNO_NEWS_HOSTS) if data else None
        return data, resource

    def append_urls(self, filename, data):
        """Copy ``filename`` and append ``item[2]`` of every ``data`` item to it.

        :return: result of ``save_resource`` for the joined file.
        """
        path = self.path("joined.news.hosts.txt")
        copyfile(filename, path)
        with codecs.open(path, 'a', encoding="utf-8") as f:
            for item in data:
                f.write(item[2])
                f.write('\n')

        return save_resource(self, path=path)

    def _query_gemini(self, request_type, file_name, output_name):
        """Run the gemini tool over ``file_name`` and parse its JSON-lines output.

        The raw output is written to ``output_name`` in the task directory and
        saved as a resource. Records without a ``Response`` or whose response
        contains ``Error`` are dropped; blank lines are skipped.

        :return: list of ``(OriginalUrl, CanonizedUrl, MainUrl[0])`` triples.
        """
        tool = utils.sync_last_stable_resource(BNO_GEMINICL_EXECUTABLE, arch='linux')
        path = self.path(output_name)
        # NOTE(review): relies on run_cmd treating ">" as an output
        # redirection - confirm against its implementation.
        run_cmd([tool, "--format", "json", "--type", request_type, "-f", file_name, ">", path])
        save_resource(self, path=path)
        triples = []
        with codecs.open(path, 'r', encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                record = json.loads(line)
                if 'Response' not in record:
                    continue
                response = record['Response']
                if 'Error' in response:
                    continue
                triples.append((response['OriginalUrl'], response['CanonizedUrl'], response['MainUrl'][0]))
        return triples

    def mobilize_urls(self, file_name):
        """Query gemini with type ``desktop2mobile`` for the urls in ``file_name``.

        Only triples whose canonized url differs from the main url are kept.

        :return: ``(triples, resource)``; ``resource`` is ``None`` if empty.
        """
        data = [
            item for item in self._query_gemini("desktop2mobile", file_name, 'mobilize.json')
            if item[1] != item[2]
        ]
        resource = save_resource(self, data, self.path('m.news.hosts.txt'), PLAIN_TEXT) if data else None
        return data, resource

    def canonize_urls(self, file_name):
        """Query gemini with type ``search_doc_id`` for the urls in ``file_name``.

        :return: ``(triples, resource)``; ``resource`` is ``None`` if empty.
        """
        data = self._query_gemini("search_doc_id", file_name, 'gemini.json')
        resource = save_resource(self, data, self.path('canonized.news.hosts.txt'), PLAIN_TEXT) if data else None
        return data, resource

    def get_source_hosts(self):
        """Download the news sources export and extract ``site_url`` values.

        The HTTP response is always closed, and ``site_url`` elements with no
        text are skipped so that ``None`` never reaches ``save_resource``.

        :return: ``(urls, resource)`` where ``resource`` is the result of
            ``save_resource`` for ``'news.txt'``.
        """
        response = urllib2.urlopen(self.ctx[NewsSourcesParam.name])
        try:
            data = response.read()
        finally:
            response.close()
        root = xml.etree.ElementTree.fromstring(data)
        urls = [e.text for e in root.iter('site_url') if e.text and e.text.strip()]
        return urls, save_resource(self, urls, 'news.txt')


# Module-level alias for the task class defined in this file.
# NOTE(review): presumably the name the Sandbox loader looks up - confirm.
__Task__ = BnoNewsHostsBuildTask
