import datetime
import json
import logging
import os
import time
import sys
import urlparse

import sandbox.sandboxsdk as sdk

from sandbox.sandboxsdk import errors
from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk import process

from sandbox.projects.common.ReleasedResourceConsumer import ReleasedResourceConsumer

from sandbox.projects import resource_types
from sandbox.projects.common.mediasearch import ban as mediaban
from sandbox.projects.common.nanny import auto_deploy

from sandbox.sandboxsdk.svn import Arcadia


class YtPythonParameter(parameters.ResourceSelector):
    # Selector for the YT Python executable resource; its name is handed to
    # last_released_resource() when the task launches the yt tool.
    resource_type = resource_types.VIDEO_YT_PYTHON_EXECUTABLE
    name = 'yt_python_resource_id'
    description = 'YT Python resource'


class DnsBanYTServerParameter(parameters.SandboxStringParameter):
    # YT proxy passed as --proxy when the host ban table (YtHostBanTable) is read.
    name = 'dnsban_yt_server'
    description = 'DNS ban YT server'
    default_value = 'hahn'
    required = True


class DeadHostBanYTServerParameter(parameters.SandboxStringParameter):
    # YT proxy passed as --proxy when the dead host ban table (YtDeadHostBanTable) is read.
    name = 'dead_host_ban_yt_server'
    description = 'Dead hosts ban YT server'
    default_value = 'hahn'
    required = True


class VideobanYTServerParameter(parameters.SandboxStringParameter):
    # YT proxy used for most table reads of the task: pornotop, video host ban,
    # player host ban, region ban, user agent ban and grimhold tables.
    name = 'videoban_yt_server'
    description = 'Video ban YT server'
    default_value = 'banach'
    required = True


class GpmBanYTServerParameter(parameters.SandboxStringParameter):
    # YT proxy passed as --proxy when the GPM ban table (GpmBanYtPath) is read.
    name = 'gpm_ban_yt_server'
    description = 'GPM ban YT server'
    default_value = 'banach'
    required = True


class GrimholdTable(parameters.SandboxStringParameter):
    # YT path of the grimhold ban table, read with a schemaful_dsv format.
    name = 'grimhold_table'
    description = 'YT grimhold table'
    default_value = '//home/antispam/export/video/grimhold_bans'
    required = False


class VideotopBanSvnPath(parameters.SandboxStringParameter):
    # Arcadia URL of the manual videotop ban list exported by the task.
    name = 'videotop_ban_svn_path'
    description = 'Videotop ban svn path'
    default_value = 'arcadia:/arc/trunk/arcadia/yweb/webscripts/video/index/config/videotop.manban'
    required = False


class VideohubBanSvnPath(parameters.SandboxStringParameter):
    # Arcadia URL of the manual videohub ban list exported by the task.
    name = 'videohub_ban_svn_path'
    description = 'Videohub ban svn path'
    default_value = 'arcadia:/arc/trunk/arcadia/yweb/webscripts/video/index/config/videohub.manban'
    required = False


class PornotopBanTable(parameters.SandboxStringParameter):
    # YT path of the pornotop ban table; the read is skipped when the value is empty.
    name = 'pornotop_ban_table'
    description = 'YT pornotop ban table'
    default_value = '//home/videoquality/vuserdata2/external_data/external_factors/toloka_porno_doc_ban'
    required = False


class GpmBanSvnPath(parameters.SandboxStringParameter):
    # Arcadia URL of the manual GPM ban list; the export is skipped when the value is empty.
    name = 'gpm_ban_svn_path'
    description = 'Gpm ban svn path'
    default_value = 'arcadia:/arc/trunk/arcadia/yweb/webscripts/video/index/config/gpm.manban'
    required = False


class GpmBanYtPath(parameters.SandboxStringParameter):
    # YT path of the GPM ban table (read as JSON); skipped when the value is empty.
    name = 'gpm_ban_yt_path'
    description = 'Gpm ban yt path'
    default_value = '//home/videoindex/deletes/video.gpm.ban'
    required = False


class PlayerFastBanSvnPath(parameters.SandboxStringParameter):
    # Arcadia URL of the manual player ban list; it is the first source written to the url ban file.
    name = 'player_fast_ban_svn_path'
    description = 'Player fast ban svn path'
    default_value = 'arcadia:/arc/trunk/arcadia/yweb/webscripts/video/player/fastban.lst'
    required = False


class YtHostBanTable(parameters.SandboxStringParameter):
    # YT path of the host ban table, read from the DnsBanYTServerParameter proxy.
    name = 'yt_host_ban_table'
    description = 'YT host ban table'
    default_value = '//home/rearrange/dns_banned/production/erfdata/dns_banned_reg.txt'
    required = True


class YtVideoHostBanTable(parameters.SandboxStringParameter):
    # YT path of the video host ban table, appended to the host ban file.
    name = 'yt_video_host_ban_table'
    description = 'YT video host ban table'
    default_value = '//home/videoindex/deletes/video.hosts.tur.dns'
    required = True


class HostDnsBanSvnPath(parameters.SandboxStringParameter):
    # Arcadia URL of the manual host DNS ban list; it is the first source written to the host ban file.
    name = 'host_dns_ban_svn_path'
    description = 'Host DNS ban svn path'
    default_value = 'arcadia:/arc/trunk/arcadia/yweb/webscripts/video/videoban/config/host.dns.manban'
    required = False


class YtPlayerHostBanTable(parameters.SandboxStringParameter):
    # YT path of the player host ban table, appended to the host ban file.
    name = 'yt_player_host_ban_table'
    description = 'YT player host ban table'
    default_value = '//home/videoindex/deletes/hostdnsban'
    required = True


class YtUserAgentBanTable(parameters.SandboxStringParameter):
    # YT path of the user agent ban table, the only source of the user agent ban file.
    name = 'yt_user_agent_ban_table'
    description = 'YT user agent ban table'
    default_value = '//home/videoindex/deletes/useragentban'
    required = True


class YtRegionBanTable(parameters.SandboxStringParameter):
    # YT path of the region ban table, appended to the host ban file.
    name = 'yt_region_ban_table'
    description = 'YT user region ban table'
    default_value = '//home/videoindex/deletes/regionban'
    required = True


class YtDeadHostBanTable(parameters.SandboxStringParameter):
    # YT path of the dead host ban table, read from the DeadHostBanYTServerParameter proxy.
    name = 'yt_dead_host_ban_table'
    description = 'YT dead host ban table'
    default_value = '//home/videoindex/deletes/factors/fast/dead_hosts'
    required = True


class BuildTestResourcesParameter(parameters.SandboxBoolParameter):
    # NOTE(review): in this file the flag is only referenced from the
    # commented-out _can_release override.
    default_value = False
    name = 'build_test_resources'
    description = 'Build test resources'


class ForceCreateUnchangedParameter(parameters.SandboxBoolParameter):
    # When set, the trie is rebuilt even if no source resource was reported as changed.
    name = 'force_unchanged'
    description = 'Force creation of unchanged resources'
    default_value = False


class YtPortionsProxyParameter(parameters.SandboxStringParameter):
    # YT proxy on which the RKN host portion table is written and moved into place.
    name = 'portions_server'
    description = 'YT Proxy for dumping portions'
    default_value = 'arnold'
    required = True


class YtHostPortionsDir(parameters.SandboxStringParameter):
    # YT directory that receives the generated grimhold_rkn_banned_*.htab portion table.
    name = 'portions_host_dir'
    description = 'YT Proxy host portions dir'
    default_value = '//home/videoindex/static_factors_new_format/host_factors/portions.normalized'
    required = True


class YtVhServiceFlagsTable(parameters.SandboxStringParameter):
    # NOTE(review): declared as a task input, but no code in this file reads its value.
    name = 'vh_service_flags_table'
    description = 'YT VH service flags table'
    default_value = '//home/videoindex/vhs/dups-trie/prevdata/vh_attrs.service_flags'
    required = True


def is_image_url(url):
    """Return True when *url* ends with .jpg, .jpeg or .gif (case-insensitive)."""
    image_suffixes = ('.jpg', '.jpeg', '.gif')
    return url.lower().endswith(image_suffixes)


def get_host(url):
    """Return the part of *url* before the first '/', ignoring a leading scheme.

    Only a literal ``https://`` or ``http://`` prefix is removed (at most
    one of them); any other scheme is left in place.
    """
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    # partition() yields the whole string when there is no '/' at all.
    return url.partition('/')[0]


def format_dns_ban(line):
    """Build a ``dnsban`` query line from a ``host<TAB>region<TAB>rest`` record.

    Everything after the second tab is ignored.  Raises ``ValueError`` when
    the stripped line has fewer than three tab-separated fields.
    """
    pieces = line.strip().split('\t', 2)
    host, region, _ignored = pieces
    return "#query\t%s\t*\t%s\turl\tdnsban\n" % (host, region)


def format_region_ban(line):
    """Build a ``common`` query line from a ``host<TAB>region`` record.

    Raises ``ValueError`` unless the stripped line consists of exactly two
    tab-separated fields.
    """
    pieces = line.strip().split('\t')
    host, region = pieces
    return "#query\t%s\t*\t%s\turl\tcommon\n" % (host, region)


def format_manual_player_ban(line):
    """Build a ``playerban`` line from a ``url<TAB>tld<TAB>region`` record.

    Raises ``ValueError`` unless the stripped line consists of exactly three
    tab-separated fields.
    """
    pieces = line.strip().split('\t')
    url, tld, region = pieces
    return "%s\t%s\turl\t%s\tplayerban\n" % (url, tld, region)


def format_videotop_ban(line):
    """Build a ``videotop`` doc ban line from a ``url<TAB>tld`` record.

    Raises ``ValueError`` unless the stripped line consists of exactly two
    tab-separated fields.
    """
    pieces = line.strip().split('\t')
    url, tld = pieces
    return "%s\t%s\tdoc\t0\tvideotop\n" % (url, tld)


def format_videotop_manban(line):
    """Build a ``videotop`` doc ban line from a manual ban list record.

    The record is ``url[<TAB>tld[<TAB>...]]``; a missing tld becomes ``*`` and
    fields after the second one are ignored.
    """
    # Padding with '*' supplies the default tld for single-field records.
    url, tld = (line.strip().split('\t') + ['*'])[:2]
    return "%s\t%s\tdoc\t0\tvideotop\n" % (url, tld)


def format_videohub_manban(line):
    """Build a ``videohub`` doc ban line from a manual ban list record.

    The record is ``url[<TAB>tld[<TAB>...]]``; a missing tld becomes ``*`` and
    fields after the second one are ignored.
    """
    # Padding with '*' supplies the default tld for single-field records.
    url, tld = (line.strip().split('\t') + ['*'])[:2]
    return "%s\t%s\tdoc\t0\tvideohub\n" % (url, tld)


def format_gpm_manban(line):
    """Build a ``common`` doc ban line from a manual GPM ban list record.

    The record is ``url[<TAB>tld[<TAB>...]]``; a missing tld becomes ``*`` and
    fields after the second one are ignored.
    """
    # Padding with '*' supplies the default tld for single-field records.
    url, tld = (line.strip().split('\t') + ['*'])[:2]
    return "%s\t%s\tdoc\t0\tcommon\n" % (url, tld)


def format_gpm_yt_ban(line):
    """Build a ``common`` doc ban line from a JSON row of the GPM ban table.

    The row must be a JSON object with string members ``key`` (the url) and
    ``value`` (the tld); both are stripped of surrounding whitespace.
    """
    row = json.loads(line)
    url, tld = (row[column].strip() for column in ('key', 'value'))
    return "%s\t%s\tdoc\t0\tcommon\n" % (url, tld)


def format_pornotop_ban(line):
    """Build a ``pornotop`` doc ban line from a JSON row carrying a ``url`` member.

    The tld of the produced record is always ``*``.
    """
    row = json.loads(line)
    return "%s\t%s\tdoc\t0\tpornotop\n" % (row["url"], "*")


def format_user_agent_ban(line):
    """Build a ``uAgentBan`` query line from a ``host<TAB>user_agent`` record.

    Raises ``ValueError`` unless the stripped line consists of exactly two
    tab-separated fields.
    """
    pieces = line.strip().split('\t')
    host, user_agent = pieces
    return "#query\t%s\t*\t0\turl\tuAgentBan=%s\n" % (host, user_agent)


def format_dead_host_ban(line):
    """Build a ``dnsban`` query line from a ``host<TAB>tld<TAB>region<TAB>rest`` record.

    Everything after the third tab is ignored.  Raises ``ValueError`` when
    the stripped line has fewer than four tab-separated fields.
    """
    pieces = line.strip().split('\t', 3)
    host, tld, region, _ignored = pieces
    return "#query\t%s\t%s\t%s\turl\tdnsban\n" % (host, tld, region)


def format_vh_service_flags_ban(line):
    """Build a ``service_flags`` query line from a ``url<TAB>flags`` record.

    Raises ``ValueError`` unless the stripped line consists of exactly two
    tab-separated fields.
    """
    pieces = line.strip().split('\t')
    url, flags = pieces
    return "#query\t%s\t*\t0\turl\tservice_flags=%s\n" % (url, flags)


# Columns of a grimhold table row, in order: Target;Domain;Region;DocType;BanType;Oblivion;Link
class GrimholdYtLine():
    """One parsed row of the grimhold ban table.

    Attributes: ``url``, ``tld``, ``region``, ``bantype``, ``sourceban`` and
    ``ban_stub_url`` (the first non-empty of the two trailing columns).
    """

    def __init__(self, line):
        """Parse a line of exactly seven tab-separated columns.

        Raises ``ValueError`` for any other number of columns.  The source
        ``common`` is renamed to ``manban`` and the region ``*`` to ``0``.
        """
        columns = line.rstrip("\n").split("\t")
        (self.url, self.tld, self.region, self.bantype, self.sourceban,
         first_stub, second_stub) = columns
        self.ban_stub_url = first_stub or second_stub
        if self.region == "*":
            self.region = "0"
        if self.sourceban == "common":
            self.sourceban = "manban"


def format_grimhold_yt_ban(line):
    """Convert a grimhold table row into a ban line, or None when it is skipped.

    Skipped rows: image urls, urls whose host ends with ``.yandex.ru``,
    rows with the ``memorandum`` source, and plain manual url bans (any tld,
    region 0, no stub url).
    """
    ban = GrimholdYtLine(line)

    if ban.sourceban == 'memorandum':
        return None
    if is_image_url(ban.url):
        return None
    if get_host(ban.url).endswith('.yandex.ru'):
        return None

    # these urls are moved into hash-file
    is_plain_manual_url_ban = (
        ban.tld == '*'
        and ban.region == '0'
        and ban.bantype == 'url'
        and ban.sourceban == 'manban'
        and not ban.ban_stub_url
    )
    if is_plain_manual_url_ban:
        return None

    record = (ban.url, ban.tld, ban.bantype, ban.region, ban.sourceban, ban.ban_stub_url)
    return "%s\t%s\t%s\t%s\t%s=%s\n" % record


def format_grimhold_rkn_host_to_portions(line):
    """Turn a formatted grimhold ban line into a host portion record.

    The input is a tab-separated line with at least five fields; field 0 is
    the url, field 3 the region and field 4 the ban info.  A record
    ``<hostname>\\tIsFilterBanned@225=1\\n`` is produced only for host-level
    bans (the url has no path once trailing slashes are removed) whose info
    contains ``rkn=`` and whose region is ``225``.

    Returns None for every other line, including urls from which no hostname
    can be extracted (for example ``http://``).  Such urls used to raise
    ``TypeError``, which aborted the whole portion build in the caller.

    Raises ``IndexError`` when the line has fewer than five fields.
    """
    tokens = line.rstrip('\n').split('\t')

    url = tokens[0].rstrip('/')
    region = tokens[3]
    info = tokens[4]

    parsed_url = urlparse.urlparse(url)
    if parsed_url.path:  # not host
        return None
    if 'rkn=' not in info or region != '225':
        return None

    hostname = parsed_url.hostname
    if not hostname:
        # No network location (bare scheme or empty url): nothing to ban.
        return None
    return hostname + '\t' + 'IsFilterBanned@' + region + '=1' + '\n'


def format_trie_input(line):
    """Reorder a query line for the trie indexer.

    Fields 1..3 of the tab-separated line are rotated from
    ``url, tld, reg`` to ``tld, reg, url``; all other fields keep their
    position.  Lines whose url is an image url produce an empty string.
    Raises ``IndexError`` when the line has fewer than four fields.
    """
    fields = line.strip().split('\t')
    if is_image_url(fields[1].lower()):
        return ''
    # url,tld,reg -> tld,reg,url
    fields[1], fields[2], fields[3] = fields[2], fields[3], fields[1]
    return '\t'.join(fields) + '\n'


class VideoReleaseVideoban(auto_deploy.AutoNannyDeployTask, mediaban.VideoBaseReleaseBanTask, ReleasedResourceConsumer):
    """
        Builds videoban.trie for Yandex.Video service
    """

    type = "VIDEO_RELEASE_VIDEOBAN"

    # NOTE(review): presumably provides the yt.wrapper module imported in
    # make_rkn_portions - confirm.
    environment = (sdk.environments.PipEnvironment("yandex-yt"),)

    input_parameters = (
        YtPythonParameter,
        DnsBanYTServerParameter,
        VideobanYTServerParameter,
        GpmBanYTServerParameter,
        GrimholdTable,
        YtHostBanTable,
        HostDnsBanSvnPath,
        YtVideoHostBanTable,
        YtPlayerHostBanTable,
        PlayerFastBanSvnPath,
        YtUserAgentBanTable,
        YtRegionBanTable,
        DeadHostBanYTServerParameter,
        YtDeadHostBanTable,
        VideotopBanSvnPath,
        VideohubBanSvnPath,
        PornotopBanTable,
        GpmBanSvnPath,
        GpmBanYtPath,
        BuildTestResourcesParameter,
        ForceCreateUnchangedParameter,
        YtPortionsProxyParameter,
        YtHostPortionsDir,
        YtVhServiceFlagsTable
    ) + mediaban.VideoBaseReleaseBanTask.input_parameters + ReleasedResourceConsumer.input_parameters

    release_subject = "video/middle/videoban-data-{timestamp}"
    release_comment = "video fast videoban.trie"
    release_resources = (
        resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_VERSION,
        resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_TRIE,
    )

    def get_stable_services(self):
        """Return a one-element list with the service id (``_SERVICE_ID`` is defined outside this class body)."""
        return [self._SERVICE_ID]

    def get_nanny_oauth_token(self):
        """Return the Nanny OAuth token stored in the vault for VIDEO-ROBOT."""
        return self.get_vault_data('VIDEO-ROBOT', 'robot-video-crawl-nanny-oauth')

    def _build_ban(self):
        """Collect all ban sources and build the videoban trie.

        Steps, in order:
          1. url ban file: manual player/videotop/videohub/gpm lists exported
             from Arcadia plus the pornotop and gpm YT tables;
          2. host ban file: manual DNS ban list plus five YT tables;
          3. user agent ban file and the grimhold ban file, both from YT;
          4. RKN host portions derived from the grimhold file (best effort);
          5. unless nothing changed (and the rebuild is not forced): merge
             everything into the raw query file, index it into the trie and
             publish the trie and version resources.

        Returns 0 when the build is skipped, otherwise the size of the trie
        file in bytes.

        NOTE(review): ``_update_resource`` is assumed to return a truthy
        value when the resource content changed - confirm in the base class.
        """
        # url_ban.lst
        url_ban_path = self.abs_path(resource_types.VIDEO_MIDDLESEARCH_URL_BAN.basename)

        # The first source is written without append=True; every later
        # source for the same file is appended.
        svn_player_ban_arcadia_url = self.ctx[PlayerFastBanSvnPath.name]
        svn_player_ban_path = url_ban_path + ".player.svn"
        Arcadia.export(svn_player_ban_arcadia_url, svn_player_ban_path)
        self._format_file(svn_player_ban_path, url_ban_path, format_func=format_manual_player_ban)

        # manual videotop ban
        svn_videotop_ban_path = url_ban_path + ".videotop.svn"
        svn_videotop_ban_arcadia_url = self.ctx[VideotopBanSvnPath.name]
        Arcadia.export(svn_videotop_ban_arcadia_url, svn_videotop_ban_path)
        self._format_file(svn_videotop_ban_path, url_ban_path, format_func=format_videotop_manban, append=True)

        # manual videohub ban
        svn_videohub_ban_path = url_ban_path + ".videohub.svn"
        svn_videohub_ban_arcadia_url = self.ctx[VideohubBanSvnPath.name]
        Arcadia.export(svn_videohub_ban_arcadia_url, svn_videohub_ban_path)
        self._format_file(svn_videohub_ban_path, url_ban_path, format_func=format_videohub_manban, append=True)

        # pornotop
        if self.ctx[PornotopBanTable.name]:
            self._read_yt_table(self.ctx[PornotopBanTable.name], url_ban_path, format_func=format_pornotop_ban, server=self.ctx[VideobanYTServerParameter.name], append=True, format="json")

        # manual gpm ban
        if self.ctx[GpmBanSvnPath.name]:
            svn_gpm_ban_path = url_ban_path + ".gpm.svn"
            svn_gpm_ban_arcadia_url = self.ctx[GpmBanSvnPath.name]
            Arcadia.export(svn_gpm_ban_arcadia_url, svn_gpm_ban_path)
            self._format_file(svn_gpm_ban_path, url_ban_path, format_func=format_gpm_manban, append=True)

        if self.ctx[GpmBanYtPath.name]:
            self._read_yt_table(self.ctx[GpmBanYtPath.name], url_ban_path, format_func=format_gpm_yt_ban, server=self.ctx[GpmBanYTServerParameter.name], append=True, format="json")

        has_changes = self._update_resource(resource_types.VIDEO_MIDDLESEARCH_URL_BAN, path=url_ban_path)

        # host ban
        host_ban_path = self.abs_path(resource_types.VIDEO_MIDDLESEARCH_HOST_BAN.basename)
        svn_host_ban_path = host_ban_path + ".svn"
        svn_host_ban_arcadia_url = self.ctx[HostDnsBanSvnPath.name]
        Arcadia.export(svn_host_ban_arcadia_url, svn_host_ban_path)

        self._format_file(svn_host_ban_path, host_ban_path, format_func=format_dns_ban)
        self._read_yt_table(self.ctx[YtHostBanTable.name], host_ban_path, server=self.ctx[DnsBanYTServerParameter.name], append=True, format_func=format_dns_ban)
        self._read_yt_table(self.ctx[YtVideoHostBanTable.name], host_ban_path, server=self.ctx[VideobanYTServerParameter.name], append=True, format_func=format_dns_ban)
        self._read_yt_table(self.ctx[YtPlayerHostBanTable.name], host_ban_path, server=self.ctx[VideobanYTServerParameter.name], append=True, format_func=format_region_ban)
        self._read_yt_table(self.ctx[YtDeadHostBanTable.name], host_ban_path, server=self.ctx[DeadHostBanYTServerParameter.name], append=True, format_func=format_dead_host_ban)
        self._read_yt_table(self.ctx[YtRegionBanTable.name], host_ban_path, server=self.ctx[VideobanYTServerParameter.name], append=True, format_func=format_region_ban)

        # `or has_changes` comes second so that _update_resource is always called.
        has_changes = self._update_resource(resource_types.VIDEO_MIDDLESEARCH_HOST_BAN, path=host_ban_path) or has_changes

        # user agent ban
        user_agent_ban_path = self.abs_path(resource_types.VIDEO_MIDDLESEARCH_USER_AGENT_BAN.basename)
        self._read_yt_table(self.ctx[YtUserAgentBanTable.name], user_agent_ban_path, server=self.ctx[VideobanYTServerParameter.name], format_func=format_user_agent_ban)

        has_changes = self._update_resource(resource_types.VIDEO_MIDDLESEARCH_USER_AGENT_BAN, path=user_agent_ban_path) or has_changes

        # grimhold
        tmp_grimhold_path = self.abs_path('grimhold.tmp')
        self._read_yt_table(self.ctx[GrimholdTable.name], tmp_grimhold_path, server=self.ctx[VideobanYTServerParameter.name], append=False,
                            format_func=format_grimhold_yt_ban,
                            format='<columns=[Target;Domain;Region;DocType;BanType;Oblivion;Link]>schemaful_dsv',
                            timeout=2000)

        has_changes = self._update_resource(resource_types.VIDEO_MIDDLESEARCH_ANTIPIRATE_URLS_BAN_GRIMHOLD, path=tmp_grimhold_path) or has_changes

        # Portions are produced on every run, before the "nothing changed" shortcut.
        self.make_rkn_portions(tmp_grimhold_path)

        if not has_changes and not self.ctx[ForceCreateUnchangedParameter.name]:
            return 0

        url2fastban_tool = self.last_released_resource(resource_types.VIDEO_URL2FASTBAN_EXECUTABLE)
        prepared_url_ban_path = url_ban_path + ".pre"
        process.run_process([
            url2fastban_tool,
            "prepareUrls",
            "-i", url_ban_path,
            "-o", prepared_url_ban_path], outputs_to_one_file=False, log_prefix="url2fastban")

        # prepare grimhold
        prepared_grimhold_path = tmp_grimhold_path + ".pre"
        process.run_process([
            url2fastban_tool,
            "prepareUrls",
            "-i", tmp_grimhold_path,
            "-o", prepared_grimhold_path], outputs_to_one_file=False, log_prefix="url2fastban")

        # prepare query.txt
        filenames = [prepared_grimhold_path, prepared_url_ban_path, host_ban_path, user_agent_ban_path]
        query_txt_path = self.abs_path(resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_RAW_DATA.basename)
        with open(query_txt_path, 'w') as outfile:
            lines = []
            for fname in filenames:
                with open(fname) as infile:
                    lines.extend(infile)
            # After sorting, records with the same (url, tld, reg) key are
            # adjacent and get collapsed into one output line.  The info of
            # the first record of a group is kept; the ban type follows the
            # later records unless it is already "doc".
            lines.sort()
            url, tld, reg, bantype, info = "", "", "", "", ""
            for line in lines:
                fields = line.split("\t")
                if fields[1] == url and fields[2] == tld and fields[3] == reg:
                    if bantype != "doc":
                        bantype = fields[4]
                else:
                    # A new key starts: flush the previous group (if any).
                    if url:
                        outfile.write("#query\t%s\t%s\t%s\t%s\t%s" % (url, tld, reg, bantype, info))
                    _, url, tld, reg, bantype, info = fields[:6]
            # Flush the last group; `info` still carries the line terminator.
            outfile.write("#query\t%s\t%s\t%s\t%s\t%s" % (url, tld, reg, bantype, info))

        # optimization test
        query_txt_path_upd = query_txt_path + '.upd'
        self._format_file(query_txt_path, query_txt_path_upd, format_func=format_trie_input)
        self._update_resource(resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_RAW_DATA, path=query_txt_path_upd)

        # videoban.trie
        indexer_tool = self.last_released_resource(resource_types.VIDEO_QUERYDATAINDEXER_EXECUTABLE)
        videoban_trie_path = self.abs_path(resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_TRIE.basename)

        process.run_process([
            indexer_tool,
            "-S", "localurlsban",
            "-N", "tld,ipregregion,exacturl",
            "-i", query_txt_path_upd,
            "-o", videoban_trie_path], outputs_to_one_file=False, log_prefix="indexer")

        # The viewer is run on the freshly built trie; its result is not used here.
        viewer_tool = self._tool(resource_types.VIDEO_QUERYDATAVIEWER_EXECUTABLE)
        process.run_process([
            viewer_tool,
            "-H",
            "-i", videoban_trie_path], outputs_to_one_file=False, log_prefix="viewer")

        self.create_resource(
            self.descr,
            videoban_trie_path,
            resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_TRIE,
            attributes={'video_testenv_autoupdate_videoban_trie': 'yes'})

        # version
        version_path = self.abs_path(resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_VERSION.basename)
        with open(version_path, "w") as version_file:
            version_file.write(str(int(time.time())))
        self.create_resource(self.descr, version_path, resource_types.VIDEO_MIDDLESEARCH_VIDEOBAN_VERSION)

        return os.stat(videoban_trie_path).st_size

    def _read_yt_table(self, table_path, dst_path, format_func, server, append=False, format="yamr", timeout=600):
        """Dump a YT table through the yt tool and format it into *dst_path*.

        :param table_path: YT path of the table to read.
        :param dst_path: local file that receives the formatted records.
        :param format_func: per-line formatter handed to ``_format_file``.
        :param server: YT proxy passed as ``--proxy``.
        :param append: forwarded to ``_format_file``.
        :param format: YT output format passed as ``--format``.
        :param timeout: timeout forwarded to the yt process.
        :raises errors.SandboxTaskFailureError: when ``wait()`` of the yt
            process returns a truthy value (presumably a non-zero exit code).
        """
        mapreduce_read = self._yt("--proxy", server, "read", "--format", format, table_path, timeout=timeout)
        if mapreduce_read.wait():
            raise errors.SandboxTaskFailureError("Failed to read %s" % (table_path))
        self._format_file(mapreduce_read.stdout_path, dst_path, format_func=format_func, append=append)

    def make_rkn_portions(self, grimhold_path):
        """Publish RKN-banned hosts from the grimhold file as a YT host portion.

        The hosts are written into a temporary YT table which is then moved
        to ``<portions dir>/grimhold_rkn_banned_<unix time>.htab``.

        This step is best effort: any failure is logged together with its
        traceback and the method returns normally, so the ban build itself
        is never interrupted.

        :param grimhold_path: local file produced by ``format_grimhold_yt_ban``.
        """
        import yt.wrapper as yt
        try:
            grimhold_rkn_hosts_file = grimhold_path + '.rkn_hosts'
            self._format_file(grimhold_path, grimhold_rkn_hosts_file, format_func=format_grimhold_rkn_host_to_portions)
            # int(time.time()) instead of strftime("%s"): the latter is a
            # libc extension that is not available on every platform.
            rkn_hosts_portion_tablepath = self.ctx[YtHostPortionsDir.name] + '/grimhold_rkn_banned_' + str(int(time.time())) + '.htab'

            # TODO: rewrite this after closing VIDEOPOISK-10805
            portions_proxy = self.ctx[YtPortionsProxyParameter.name]
            yt.config['proxy']['url'] = portions_proxy
            yt.config['token'] = self.get_vault_data('VIDEODEV', 'yt_token')
            with yt.TempTable() as tmp_table, open(grimhold_rkn_hosts_file, 'r') as fin:
                file_content = fin.read()
                yt.write_table(tmp_table, file_content, format='yamr', raw=True)
                sys.stderr.write('Rkn hosts table:' + tmp_table)
                move_process = self._yt('--proxy', portions_proxy, 'move', tmp_table, rkn_hosts_portion_tablepath)
                # _yt starts the process without waiting.  The move has to
                # finish before the TempTable context exits, otherwise the
                # temporary table may be removed while it is still being moved.
                if move_process.wait():
                    raise errors.SandboxTaskFailureError(
                        "Failed to move %s to %s" % (tmp_table, rkn_hosts_portion_tablepath))
        except Exception:
            # Deliberately swallowed (best effort), but keep the traceback.
            logging.exception('Making banned hosts portions error')
            return  # TODO: send juggler notification

    def _yt(self, cmd, *args, **kwargs):
        """Start the yt command line tool without waiting for it to finish.

        :param cmd: first argument of the tool; it also forms the log prefix.
            Callers in this class pass ``"--proxy"`` here, so their logs are
            prefixed with ``yt.--proxy``.
        :param args: remaining command line arguments.
        :param kwargs: only ``timeout`` is honoured (default 600).
        :return: the process object returned by ``process.run_process``;
            callers use its ``wait()`` and ``stdout_path``.
        """
        yt_tool = self.last_released_resource(resource_types.VIDEO_YT_PYTHON_EXECUTABLE, YtPythonParameter.name)
        yt_args = (
            yt_tool,
        )

        return process.run_process(
            yt_args + (cmd,) + args,
            environment={"YT_TOKEN": self.get_vault_data('VIDEODEV', 'yt_token')},
            outputs_to_one_file=False,
            log_prefix="yt.{}".format(cmd),
            timeout=kwargs.get('timeout', 600),
            wait=False
        )

    def _update_service(self, service_id):
        """Log the service id, then delegate to the AutoNannyDeployTask implementation."""
        logging.info("_update_service: {}".format(str(service_id)))
        auto_deploy.AutoNannyDeployTask._update_service(self, service_id)

#    def _can_release(self, build_task):
#        if self.ctx[BuildTestResourcesParameter.name]:
#            return False
#        return mediaban.VideoBaseReleaseBanTask._can_release(self, build_task)


__Task__ = VideoReleaseVideoban
