# -*- coding: utf-8 -*-

import os
import logging
import random
import re

from sandbox import common
from sandbox import sdk2
from sandbox.sandboxsdk.environments import PipEnvironment
from sandbox.common.types.client import Tag
from sandbox.projects.vh.frontend import (
    VhDolbilkaRawRequests,
)


class VhGenerateRequestsFromYt(sdk2.Task):
    """
        Select up to max_request_number videohosting requests from YT logs using reservoir sampling
        and write them to a file with requests for VhDolbilkaPlanCreator.
    """

    class Requirements(sdk2.Requirements):
        # Host requirements for the Sandbox client that executes this task.
        # NOTE(review): the units of execution_space / required_ram are not visible in this
        # file (presumably megabytes, i.e. 10 GiB and 16 GiB) — confirm against the sdk2 docs.
        privileged = True
        # Tag expression combined with `&`; presumably the client has to carry all three
        # tags — confirm against the sdk2 tag semantics.
        client_tags = Tag.INTEL_E5_2650 & Tag.LXC & Tag.GENERIC
        execution_space = 10 * 1024
        required_ram = 16 * 1024
        # Pinned yandex-yt package; get_log_samples_from_yt imports yt.wrapper at call time
        # (presumably supplied by this package).
        environments = [
            PipEnvironment("yandex-yt", "0.9.26"),
        ]

    class Parameters(sdk2.Task.Parameters):
        # Input parameters of the task, followed by the output parameters it fills in.

        # Name of the vault record that is read with sdk2.Vault.data() to obtain the YT token.
        yt_token_vault = sdk2.parameters.String(
            "YT_TOKEN vault name",
            name="yt_token_vault",
            default="yt_token_for_testenv",
            required=True,
        )
        # Passed as the first argument to YtClient.
        yt_cluster = sdk2.parameters.String(
            "YT cluster (i.e. hahn)",
            name="yt_cluster",
            default="hahn",
            required=True,
        )
        # YT node whose children are listed; the child with the greatest name (string order)
        # is the table that gets read. If the path contains "http_adapter", rows are parsed
        # as http_adapter logs, otherwise as balancer logs.
        request_log_path = sdk2.parameters.String(
            "Request logs path in YT",
            name="request_log_path",
            default="//home/videodev/fawnd2/http_adapter_logs",
            required=True,
        )
        # Converted with int() and used as the reservoir size of the sampler.
        max_request_number = sdk2.parameters.String(
            "maximum requests number",
            name="max_request_number",
            default="10000"
        )
        # Converted with int(); an empty value disables the row limit.
        yt_logs_limit = sdk2.parameters.String(
            "number of rows in YT table with logs to read",
            name="yt_logs_limit",
            default="1000000"
        )
        # URL prefixes (matched with str.startswith). When non-empty, only matching rows are sampled.
        handlers_whitelist = sdk2.parameters.List(
            "If not empty only handlers from whitelist will be shot, handlers should start with /",
        )
        # URL prefixes (matched with str.startswith) of rows that are never sampled.
        handlers_blacklist = sdk2.parameters.List(
            "handlers from this blacklist will not be shot, handlers should start with /",
        )
        # Extra "Name: value" headers added to generated requests.
        headers = sdk2.parameters.List(
            "headers for shooting",
            default=["Origin: yandex.ru"],
        )
        # Compared against random.random() * 100 once per request to decide whether the
        # extra headers above are added.
        # NOTE(review): the default is the string "100" although the parameter is an
        # Integer — presumably sdk2 casts it; confirm.
        percentage_use_of_headers = sdk2.parameters.Integer(
            "percentage use of headers for shooting",
            name="percentage_use_of_headers",
            default="100",
        )

        # Output parameters; both are assigned in on_execute.
        with sdk2.parameters.Output():
            # Number of sampled log rows.
            request_number = sdk2.parameters.Integer(
                "requests number",
            )
            # Resource holding the generated requests file.
            raw_requests = sdk2.parameters.Resource(
                "requests to VhDolbilkaPlanCreator",
                name="raw_requests_file",
                resource_type=VhDolbilkaRawRequests
            )

    def on_execute(self):
        """
        Task entry point.

        Samples log rows from YT, registers the VhDolbilkaRawRequests output
        resource, publishes the number of sampled rows and writes the requests
        file into the resource.
        """
        sampled_rows = list(self.get_log_samples_from_yt())
        logging.info("Successfully read logs from yt")

        # The resource has to exist before its data path can be resolved.
        self.Parameters.raw_requests = VhDolbilkaRawRequests(
            self,
            "requests to VhDolbilkaPlanCreator",
            "requests.txt",
        )
        resource_data = sdk2.ResourceData(self.Parameters.raw_requests)
        output_path = resource_data.path

        self.Parameters.request_number = len(sampled_rows)
        self.create_requests_file(sampled_rows, output_path)
        logging.info("Successfully created requests file")

    def is_http_adapter_logs(self):
        return "http_adapter" in self.Parameters.request_log_path

    def get_url_from_balancer_log_row(self, log_row, regexp):
        request = log_row["url"]
        url = request.strip("\"").split(" ")[1]
        match = re.match(regexp, url)
        if match:
            url = url[match.end():]
        return url

    def get_url_from_http_adapter_log_row(self, log_row):
        return log_row["request"]

    def get_url(self, log_row, regexp):
        if self.is_http_adapter_logs():
            return self.get_url_from_http_adapter_log_row(log_row)
        else:
            return self.get_url_from_balancer_log_row(log_row, regexp)

    def get_headers_from_balancer_log_row(self, log_row, regexp):
        headers = []
        processing_tree = log_row["processing_tree"]
        search = re.search(regexp, processing_tree)
        if search:
            user_agent = search.group()[3:-3]
            headers.append(user_agent)
        return headers

    def get_headers_from_http_adapter_log_row(self, log_row, task_headers):
        headers = []
        task_keys = [header.split(':')[0].lower().strip() for header in task_headers]

        def _append_header_if_exist(log_key, header_key):
            if log_row.get(log_key, '-') != '-' and header_key.lower().strip() not in task_keys:
                headers.append("{}: {}".format(header_key, log_row[log_key]))

        _append_header_if_exist("cookies", "Cookie")
        _append_header_if_exist("x_yandex_internal_request", "X-Yandex-Internal-Request")
        _append_header_if_exist("user_agent", "User-Agent")
        _append_header_if_exist("x_forwarded_for", "X-Forwarded-For-Y")

        return headers

    def get_headers(self, log_row, regexp, task_headers):
        if self.is_http_adapter_logs():
            return self.get_headers_from_http_adapter_log_row(log_row, task_headers)
        else:
            return self.get_headers_from_balancer_log_row(log_row, regexp)

    def create_requests_file(self, log_sample, requests_file):
        logging.info("Start create requests file")
        regexp_prefix = re.compile('\/v[0-9]+')
        regexp_user_agent = re.compile('<::User-Agent.*::>')
        with open(str(requests_file), "w") as out_file:
            for log_row in log_sample:
                random_seed = random.randint(1, 10 ** 9)
                headers = ["X-random-seed: {random_seed}".format(random_seed=random_seed)]

                header_probability = random.random()
                for header in self.Parameters.headers:
                    if header_probability * 100 <= self.Parameters.percentage_use_of_headers:
                        headers.append(header)

                request_headers = self.get_headers(log_row, regexp_user_agent, self.Parameters.headers)
                for header in request_headers:
                    headers.append(header)

                url = self.get_url(log_row, regexp_prefix)
                plan_row = "{url}\t{headers}\n".format(
                    url=url,
                    headers="\\n".join(headers),
                )
                out_file.write(plan_row)

    def get_log_samples_from_yt(self):
        """
        Read the newest log table from YT and return a random sample of its rows.

        The children of ``request_log_path`` are listed and the one with the
        greatest name (string order) is read (presumably tables are named by
        date, so this is the most recent one — confirm). Rows are filtered by
        the handler whitelist/blacklist and fed into a ReservoirSampling of
        size ``max_request_number``. At most ``yt_logs_limit`` rows are read
        when that parameter is set.

        The task's ``handlers_blacklist`` parameter is not modified: the
        built-in player blacklist is merged into a local copy.

        :return: ReservoirSampling instance (iterable over the sampled rows).
        :raises common.errors.TaskFailure: when there is no log table to read.
        """
        from yt.wrapper import JsonFormat, YtClient

        yt_token = sdk2.Vault.data(self.Parameters.yt_token_vault)
        yt_cluster = self.Parameters.yt_cluster
        client = YtClient(yt_cluster, yt_token)
        regexp_prefix = re.compile(r'\/v[0-9]+')

        log_path_suffix = client.list(self.Parameters.request_log_path)
        logging.info("Found logs: %s", ", ".join(log_path_suffix))
        if not log_path_suffix:
            # Nothing to pick from; report it the same way as a missing table.
            raise common.errors.TaskFailure("Table with logs not found")
        current_log_path_suffix = max(log_path_suffix)

        table_name = os.path.join(self.Parameters.request_log_path, current_log_path_suffix)
        logging.info("Used logs table: %s", table_name)

        if not client.exists(table_name):
            raise common.errors.TaskFailure("Table with logs not found")

        sample_size = int(self.Parameters.max_request_number)

        # An empty limit means "read the whole table"; convert once, not per row.
        raw_rows_limit = self.Parameters.yt_logs_limit
        rows_limit = int(raw_rows_limit) if raw_rows_limit else None

        # Players that are always excluded from shooting.
        # NOTE(review): presumably weather players, judging by the constant name — confirm.
        WEATHER_UUIDS = ["4539ed1505f61726a39631e3e289f0a1",
                         "4ee0a5b3ed0e753aab2f1ab836ed5fe6",
                         "479100e13e47eed29dc4226c204ee276",
                         "4acb3608ddec43c19a239888171677eb",
                         "4d5af6ad9fbb9451aff1faf40a6ecdbe",
                         "41fa55e88e8cab54bcd08d0b3fd5cfba",
                         "4e51be4bf9f83cdcb1d0c70982211199",
                         "470db51dda1f397a88c8b391cecd372c",
                         "418b30b6e2cd62ac8c35242fce98518d",
                         "464e8f8f817ed112ae4cbc3cb593981a",
                         "4f2aa42e1bca6b40a3ba23be4e7ace5f",
                         "4cf88d21a10dbfda9b588385f12d6f0d",
                         "4011c850fbd557668172f4f77185219c",
                         "408d213edf788dad85e56aa5bfe09bd6",
                         "4f942775aca910ac871c39e7a8a9ae81",
                         "48487d10ebecc57b80ac19c0132d2ea3",
                         "432aa9a630a12e7fbc5fe9c32c8e9d41",
                         "4f7bcceb68b8d43f876bda2912179953",
                         "4db47d4c96f9b8f5bf647f8310839496",
                         "46ebbb425b5a6a6493bcfabff7a5a38c",
                         "457bd504f25734a08c63375f4d454aa9",
                         "4799a654d1b1c520a1b685251b68f460",
                         "4638797eaf9404c581d09689f0aa1571",
                         "472ad3f8f46cb253b835e8d6a9e443e5",
                         "4ae8f96b01eea726802e43cc2505fbfd",
                         "45e7860a402176c381236770b2a7aa5e",
                         "45887986b94fcb8380119e1632d31887",
                         "48172d6679c147c6904172fac906f643",
                         "4dc0449fef7cbb8b8880286d6867beac",
                         "47d517e525272624909c0ce28a266536",
                         "4f40365f0d6e26e5b2cdbe1ddcea9b85",
                         "4583ace5fa2de7229fb5b105564f3848",
                         "4b57973b64c3e9ce8229471f9a04a7cf",
                         "4c2ae56ba90b75fc926861d37a22ecb8",
                         "419b1d501daa2c2f97f97c1ca7b786da",
                         "4bcf13e452f14d47b4448b79ff530e3f",
                         "4f31ff2324530307bd4cd2b28a6a8ee1"]

        # Work on local tuples: the task's input parameters stay untouched and
        # str.startswith can test all prefixes in a single call.
        handlers_whitelist = tuple(self.Parameters.handlers_whitelist or ())
        handlers_blacklist = tuple(self.Parameters.handlers_blacklist or ()) + tuple(
            "/vh/player-html?handler=public/player/" + uuid for uuid in WEATHER_UUIDS
        )

        def _is_accepted_row(row):
            """Blacklist wins; an empty whitelist accepts every remaining row."""
            url = self.get_url(row, regexp_prefix)
            if url.startswith(handlers_blacklist):
                return False
            return not handlers_whitelist or url.startswith(handlers_whitelist)

        logging.info("Start ReservoirSampling")
        sample_rows = self.ReservoirSampling(sample_size)
        rows = client.read_table(table_name, format=JsonFormat())

        for rows_counter_in_yt, row in enumerate(rows):
            # Stop once exactly rows_limit rows have been processed.
            if rows_limit is not None and rows_counter_in_yt >= rows_limit:
                break

            if rows_counter_in_yt % 100000 == 0:
                logging.info("Rows counter in yt: %d", rows_counter_in_yt)

            if _is_accepted_row(row):
                sample_rows.sample(row)

        return sample_rows

    class ReservoirSampling(object):
        """
        Uniform random sample of at most ``sample_num`` items from a stream.

        The first ``sample_num`` items are stored as is. Every later item, the
        i-th one seen (counting from 1), draws a slot uniformly from [0, i) and
        replaces the stored item in that slot when the slot is inside the
        reservoir (reservoir sampling, "Algorithm R").

        Iterating over the object yields the currently stored items.
        """

        def __init__(self, sample_num):
            """
            :param sample_num: reservoir size; anything accepted by int().
            :raises ValueError: when the converted size is not positive.
            """
            # Convert first so that the bound is checked on the value actually used,
            # and raise explicitly: an assert would vanish under ``python -O``.
            sample_num = int(sample_num)
            if sample_num <= 0:
                raise ValueError("sample_num must be a positive integer, got {!r}".format(sample_num))
            self._sample_num = sample_num
            self._reservoir = []
            self._pos = 0  # number of items offered so far

        def sample(self, item):
            """Offer one more stream item to the reservoir."""
            self._pos += 1
            if len(self._reservoir) < self._sample_num:
                # Still filling up: keep everything.
                self._reservoir.append(item)
                return
            slot = int(random.random() * self._pos)
            if slot < self._sample_num:
                self._reservoir[slot] = item

        def __iter__(self):
            return iter(self._reservoir)
