# -*- coding: utf-8 -*-

import logging
import yt.wrapper as yt
import random
from datetime import datetime
from string import ascii_letters, digits
import re
from time import time
import os

from bannerland.yql.tools import do_yql, get_client
from bm.yt_tools import set_upload_time
import irt.broadmatching.common_options
from irt.monitoring.solomon.sensors import SolomonAgentSensorsClient
from library.charset.doccodes import doccodes

logging.basicConfig(level=logging.INFO, format="%(asctime)s\t[%(process)d]\t%(levelname)s\t%(message)s")
logger = logging.getLogger(__name__)


# Pragmas prepended to every YQL request built by this script
PRAGMA_STR = """
    PRAGMA yt.InferSchema;
    PRAGMA yt.MaxRowWeight = "134217728"; -- bytes in 128 Mb
    """

# Name of the pool used for both YT config and YQL requests
POOL = "bannerland-data"

# Path to the input table dyn-perf-domains (rows are read by their "Domain" column)
INPUT_TABLE = "//home/bannerland/data/dse/preparing/dyn-perf-domains"

# Path to the result table kwyt_banners (columns URL, Title)
RESULT_TABLE = "//home/bannerland/data/dse/preparing/kwyt_banners"

# Folder that holds per-shard results (final_NNN tables) and the table of hosts
MAIN_FOLDER = "//home/bannerland/data/dse/preparing/dpd-kwyt"

# Table of host combinations generated from dyn-perf-domains
HOST_COMBINATIONS = MAIN_FOLDER + "/dpd_res"

# Number of KWYT shards to process (shards are numbered 0 .. SHARDS_COUNT - 1)
SHARDS_COUNT = 32

# Minimum interval between two generation starts of this script (in seconds)
LAUNCH_TIME_DELTA = 60 * 60 * 24 * 2  # 2 days

# Object in YT (cluster Locke) used to take the distributed lock
LOCK_YT_OBJECT = "//home/bannerland/locks/kwyt_banners"

# Object in YT (cluster Locke) whose attributes store the script state
# (last_generating_start, last_update_time)
ATTR_YT_OBJECT = "//home/bannerland/attributes/kwyt_banners"

# Timeout passed to every YQL request, in hours (one week)
TIMEOUT_HOUR = 24 * 7

# Dict of the most popular encodings in KWYT for domains from 'dyn-perf-domains':
# maps a doccodes charset id to the Python codec name used to decode page titles.
# Rows whose Charset is not a key of this dict are skipped by the mapper.
DOCCODES_DICT = {
    doccodes.CODES_UTF8: "utf-8",  # 13
    doccodes.CODES_WIN: "windows-1251",  # 0
    doccodes.CODES_ASCII: "windows-1252",  # 5
    doccodes.CODES_KOI8: "koi8_u",  # 1
    doccodes.CODES_WINDOWS_1254: "windows-1254",  # 30
    doccodes.CODES_MAIN: "iso8859_5",  # 4
    doccodes.CODES_WINDOWS_1255: "windows-1255",  # 31
    doccodes.CODES_WINDOWS_1257: "windows-1257",  # 33
    doccodes.CODES_WIN_EAST: "windows-1250",  # 7
    doccodes.CODES_ISO_EAST: "iso8859_2"  # 8
}


# Map class. For getting pairs <URL> - <Title>
class MapClass:
    """Mapper that turns page rows into ``{"URL": ..., "Title": ...}`` rows.

    A row is emitted only when the page passes every filter in ``__call__``
    and a non-empty title could be extracted and decoded from its body.
    """

    # Pages whose LastAccess is older than this many seconds are skipped (120 days, ~4 months)
    MAX_PAGE_AGE = 3600 * 24 * 120

    def __init__(self):
        # DOTALL is passed as a compile flag instead of an inline "(?s)" in the
        # middle of the pattern: a global inline flag that is not at the start
        # of the expression is deprecated since Python 3.6 and rejected since
        # Python 3.11. The matching behaviour is unchanged: "." also matches
        # newlines, so titles spanning several lines are captured.
        self.reg_pattern = re.compile(
            r"<(?:title|Title|TITLE)>(.*?)</(?:title|Title|TITLE)>", re.DOTALL)
        # Reference time for the LastAccess freshness filter, fixed at construction
        self.cur_time = time()

    # Return titles from http-body
    def get_title(self, html_body, charset):
        """Return the first ``<title>`` content of ``html_body`` re-encoded as UTF-8.

        ``charset`` is a key of ``DOCCODES_DICT`` selecting the source codec.
        Returns an empty string when there is no title tag or when the title
        cannot be decoded.
        """
        match = self.reg_pattern.search(html_body)
        raw_title = match.group(1) if match is not None else ""

        # Best effort on purpose: one undecodable page must not fail the whole job
        try:
            return raw_title.decode(DOCCODES_DICT[charset]).encode('utf-8')
        except Exception:
            return ""

    # Mapper function
    def __call__(self, row):
        """Yield at most one ``{"URL", "Title"}`` dict for the given row.

        The row is skipped unless IsLast is set (neither None nor False),
        HttpCode is 200, MimeType is 2 (presumably HTML - TODO confirm),
        Charset is a known key of ``DOCCODES_DICT``, LastAccess is within
        ``MAX_PAGE_AGE`` of the mapper creation time, and Host, Path and
        HttpBody are all present.
        """
        is_last = row.get("IsLast")
        if is_last is None or is_last is False:
            return

        # A missing value (None) is also rejected by these comparisons
        if row.get("HttpCode") != 200:
            return

        if row.get("MimeType") != 2:
            return

        charset = row.get("Charset")
        if charset not in DOCCODES_DICT:
            return

        last_access = row.get("LastAccess")
        if last_access is None or self.cur_time - last_access > self.MAX_PAGE_AGE:
            return

        host_name = row.get("Host")
        http_body = row.get("HttpBody")
        path = row.get("Path")
        if http_body is None or host_name is None or path is None:
            return

        # Join host and path with exactly one slash at the boundary
        # (only a single trailing slash of the host is dropped)
        url_name = host_name[:-1] if host_name.endswith("/") else host_name
        if path.startswith("/"):
            url_name += path
        else:
            url_name += "/" + path

        title = self.get_title(http_body, charset)
        if title:
            yield {"URL": url_name, "Title": title}


# Hosts map function. Extract clear domain
def hosts_map_func(row):
    """Yield ``{"ClearDomain": host}`` for the "Domain" value of ``row``.

    The value is cut at the first slash, a leading "www." is removed and the
    rest is converted to its IDNA form when that conversion succeeds.
    Nothing is yielded when "Domain" is missing or nothing precedes the
    first slash.
    """
    raw_domain = row.get("Domain")
    if raw_domain is None:
        return

    # Keep only what precedes the first slash
    host = raw_domain.partition('/')[0]
    if not host:
        return

    if host.startswith("www."):
        host = host[4:]

    # Best effort: when the conversion fails the host is emitted as is
    try:
        host = host.decode("utf-8").encode('idna')
    except Exception:
        pass

    yield {"ClearDomain": host}


# Hosts reduce function. Generate host variants for each clear domain
def hosts_red_func(key, recs):
    """Yield six ``{"HostVersion": ...}`` rows for the clear domain in ``key``.

    Every scheme prefix ("http://", "https://", none) is combined with an
    optional "www." prefix. ``recs`` is not used: one group of rows produces
    one set of variants.
    """
    base = key["ClearDomain"]
    schemes = ("http://", "https://", "")
    www_prefixes = ("www.", "")

    variants = [scheme + www + base for scheme in schemes for www in www_prefixes]
    for variant in variants:
        yield {"HostVersion": variant}


# Main function
if __name__ == "__main__":
    # Script flow:
    #   1. take the distributed lock (exit when it cannot be taken);
    #   2. detect already finished shards (final_NNN tables) and resume after them,
    #      or start a new generation when none exist and the previous generation
    #      started more than LAUNCH_TIME_DELTA seconds ago;
    #   3. for every remaining shard: YQL filter by host, then YT map to <URL, Title>;
    #   4. concatenate all shard results into RESULT_TABLE and clean up.

    # Get YT config and YT clients
    main_config = yt.default_config.get_config_from_env()
    main_config["pool"] = POOL
    main_config["create_table_attributes"] = {"compression_codec": "lz4"}
    yt_main_client = yt.YtClient(proxy='hahn', config=main_config)
    yt_lock_client = yt.YtClient(proxy='locke', config=yt.default_config.get_config_from_env())
    yt_attr_client = yt.YtClient(proxy='locke', config=yt.default_config.get_config_from_env())

    # Get YQL client
    yql_client = get_client('hahn')

    # Find out last update time and set new (in case of starting new generation).
    # All the work below runs inside this transaction of the lock client.
    with yt_lock_client.Transaction():

        try:
            yt_lock_client.lock(LOCK_YT_OBJECT)
        except Exception:
            # Presumably another instance holds the lock; say so instead of exiting silently
            logger.info("Cannot take lock %s, exit", LOCK_YT_OBJECT)
            exit()

        # Find first shard for handling (view ready-made tables)
        shard_start = 0
        final_tables_set = set()
        last_generation_start_attr = None

        for table_path in yt_main_client.search(MAIN_FOLDER, node_type=["table"]):
            # The last path component is the table name; split() never returns an empty list
            table_name = table_path.split('/')[-1]
            search_res = re.search(r"^final_(\d{3})$", table_name)

            if search_res is not None:
                shard_start = max(shard_start, int(search_res.group(1)) + 1)
                final_tables_set.add(
                    "{main_folder}/{table}".format(main_folder=MAIN_FOLDER, table=search_res.group(0)))

        if shard_start == 0:
            # No finished shards: this is a new generation, check that it is not too early
            try:
                last_script_start = yt_attr_client.get(ATTR_YT_OBJECT + "/@last_generating_start")
            except Exception:
                last_script_start = None

            if (last_script_start is not None) and (time() - last_script_start <= LAUNCH_TIME_DELTA):
                logger.info("Previous generation started less than %s seconds ago, exit", LAUNCH_TIME_DELTA)
                exit()

            # Stored into the attribute only after the first shard is done (see the loop below)
            last_generation_start_attr = time()

            # Get different combinations of hosts per each domain
            yt_main_client.remove(HOST_COMBINATIONS, force=True)
            yt_main_client.run_map_reduce(
                hosts_map_func,
                hosts_red_func,
                INPUT_TABLE,
                HOST_COMBINATIONS,
                reduce_by=['ClearDomain']
            )

        # Random prefix for the temporary YT objects of this run
        tmp_path_prefix = "//tmp/" + "".join([random.choice(ascii_letters + digits) for i in range(24)]) + "_dpd-kwyt"

        # Load hosts' combinations to an YT file
        logger.error("Load hosts' combinations to tmp file...")
        host_combinations_file = os.path.abspath(irt.broadmatching.common_options.get_options()['dirs']['temp']) + "/host_versions_for_kwyt_banners"
        with open(host_combinations_file, "w") as wf:
            for row in yt_main_client.read_table(HOST_COMBINATIONS):
                wf.write(row["HostVersion"] + "\n")

        hosts_file = "{}_hosts_file".format(tmp_path_prefix)
        yt_main_client.create("file", hosts_file)
        # Close the local file once the upload is finished instead of leaking the handle
        with open(host_combinations_file) as rf:
            yt_main_client.write_file(hosts_file, rf)

        # Start KWYT shards handling
        # KWYT shards parallel processing is forbidden, use one shard at time
        for shard in range(shard_start, SHARDS_COUNT):
            page_name = str(shard).zfill(3)

            inter_table = "{tmp_path}_inter_{page_name}".format(tmp_path=tmp_path_prefix, page_name=page_name)
            final_table = "{main_folder}/final_{page_name}".format(main_folder=MAIN_FOLDER, page_name=page_name)
            final_tables_set.add(final_table)

            logger.info("Shard %s -> START", page_name)

            # Step 1. YQL request (INNER JOIN)
            request = """
                {pragma}
                PRAGMA yt.PublishedCompressionCodec = "lz4";
                PRAGMA File("host_combinations", "yt://hahn/{hosts_file}");

                INSERT INTO `{inter_table}`

                SELECT kwyt.Host AS Host, kwyt.Path AS Path, kwyt.HttpBody AS HttpBody, kwyt.LastAccess AS LastAccess,
                kwyt.HttpCode AS HttpCode, kwyt.IsLast AS IsLast, kwyt.MimeType AS MimeType, kwyt.Charset AS Charset

                FROM `//home/kwyt/pages/{page_name}/data` AS kwyt
                WHERE (kwyt.Host IN ParseFile("String", "host_combinations")) AND (kwyt.Host IS NOT NULL) AND (LENGTH(kwyt.Host) > 0)
            """.format(pragma=PRAGMA_STR, hosts_file=hosts_file.replace("//", ""), inter_table=inter_table, page_name=page_name)
            do_yql(yql_client, request, yt_pool=POOL, timeout_hour=TIMEOUT_HOUR)

            logger.info("Shard %s -> YQL 'INNER JOIN' REQUEST COMPLETED", page_name)

            # Step 2. YT map (extract <URL> - <Title>)
            with yt_main_client.Transaction():
                yt_main_client.lock(inter_table)
                yt_main_client.run_map(MapClass(), inter_table, final_table)
            yt_main_client.remove(inter_table)
            yt_main_client.run_merge(final_table, final_table, spec={"combine_chunks": True})

            logger.info("Shard %s -> OK!", page_name)

            # Record the generation start once, after the first shard of a new generation succeeded
            if last_generation_start_attr:
                yt_attr_client.set(ATTR_YT_OBJECT + "/@last_generating_start", last_generation_start_attr)
                last_generation_start_attr = None

        # Union all shard results to the output table
        concat_tables = ", ".join('`' + final_table + '`' for final_table in final_tables_set)
        request = """
            {pragma}
            PRAGMA yt.PublishedCompressionCodec = "brotli_5";

            INSERT INTO `{result_table}` WITH TRUNCATE
            SELECT URL, Title
            FROM CONCAT({concat_tables});
        """.format(pragma=PRAGMA_STR, result_table=RESULT_TABLE, concat_tables=concat_tables)

        do_yql(yql_client, request, yt_pool=POOL, timeout_hour=TIMEOUT_HOUR)

        set_upload_time(RESULT_TABLE, yt_main_client)

        # Delete all intermediate tables; with no final_NNN tables left,
        # the next launch starts a new generation
        for final_table in final_tables_set:
            yt_main_client.remove(final_table)

        # Set last update time in attributes
        yt_attr_client.set(ATTR_YT_OBJECT + "/@last_update_time", datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))

        # Done!
        logger.error("GET_RESOURCES_OK")
        solomon_client = SolomonAgentSensorsClient()
        solomon_client.set_success_script_finish("get_kwyt_source_url_title")
