# -*- coding: utf-8 -*-

from sandbox import sdk2 as sdk
import sandbox.sdk2.helpers as helpers
from sandbox.sdk2.helpers import subprocess as sp

from sandbox.projects import resource_types as rt
from sandbox.projects.app_host import resources as app_host_resources
from sandbox.projects.common.dolbilka import resources as dolbilka_resources

from sandbox import common
import sandbox.common.types.resource as ctr
import sandbox.common.types.client as ctc

import os
import shutil
import tarfile
import urllib2
import urllib
import re
import json
import time
import logging
import contextlib


class UpdateTestenvNewsdResources(sdk.Task):
    """
        Downloads and parses production eventlogs and generates a list of requests to slave_newsd.
        Downloads the current production slave_newsd base.
    """

    # Host requirements requested from Sandbox for this task.
    class Requirements(sdk.Task.Requirements):
        # presumably expressed in MiB (i.e. ~80 GiB of disk) — TODO confirm against the Sandbox SDK
        disk_space = 80 * 1024
        # presumably expressed in MiB (i.e. ~20 GiB of RAM) — TODO confirm against the Sandbox SDK
        ram = 20 * 1024
        # Restrict execution to clients from the LINUX tag group.
        client_tags = ctc.Tag.Group.LINUX

    # User-visible task parameters: tool resources, service names and request filters.
    class Parameters(sdk.Task.Parameters):
        max_restarts = 1
        # presumably seconds (5 hours) — TODO confirm against the Sandbox SDK
        kill_timeout = 5 * 3600
        description = "Update resources for slave_newsd's tests in TestEnv"

        # Tarball that is unpacked and searched for the `event_log_dump` binary
        # (see create_apphost_requests).
        apphost_bundle = sdk.parameters.LastReleasedResource(
            "Apphost bundle with evlogdump",
            resource_type=rt.STABLE_NEWS_APP_HOST_BUNDLE,
            state=ctr.State.READY,
            required=True
        )

        # Service name used to resolve AppHost instances (via `sky listinstances`
        # or the Nanny API, depending on `apphost_in_porto`).
        apphost_instances = sdk.parameters.String(
            'News AppHost instances',
            default='production_app_host_news_dynamic',
            required=True
        )

        # When true, instances are listed with `sky` and logs are fetched with scp;
        # otherwise the Nanny API and rsync are used.
        apphost_in_porto = sdk.parameters.Bool(
            "Assume that News AppHost instances run in porto containers",
            default=True
        )

        # Executable used to dump noapache eventlogs (see create_plain_requests).
        noapach_evlogdump = sdk.parameters.LastReleasedResource(
            "Noapach evlogdump executable",
            resource_type=rt.EVLOGDUMP_EXECUTABLE,
            state=ctr.State.READY,
            required=True
        )

        # Nanny service name used to resolve noapache instances.
        noapach_instances = sdk.parameters.String(
            'News noapach service',
            default='production_news_noapache',
            required=True
        )

        # Comma-separated host list; get_active_host() probes each one on port 3000.
        indexer_hosts = sdk.parameters.String(
            'News indexer hosts',
            default='nind00.search.yandex.net,nind04.search.yandex.net,nind05.search.yandex.net,nind15.search.yandex.net,nind17.search.yandex.net',
            required=True
        )

        # flag name -> forbidden value; the special value "ALL_VALUES" rejects the
        # flag regardless of its value (see bad_query).
        bad_flags = sdk.parameters.Dict(
            "Slave flags blacklist (use custom value ALL_VALUES to ignore all flag's values)",
            required=False,
            default={}
        )

        # Requests whose `full_url` contains any of these substrings are dropped
        # (see bad_query).
        bad_string_in_queries = sdk.parameters.List(
            "We don't want these substrings in full_url",
            required=False,
            default=[]
        )

        # Tool run on the collected request files before d-planner
        # (see make_perfomance_plan).
        make_tank_ammo = sdk.parameters.LastReleasedResource(
            "AppHost tool make_tank_ammo",
            resource_type=app_host_resources.APP_HOST_TOOL_MAKE_TANK_AMMO_EXECUTABLE,
            state=ctr.State.READY,
            required=True
        )

        # Tool that produces the *.plan resources.
        dolbilka_planner = sdk.parameters.LastReleasedResource(
            "d-planner tool from Dolbilka",
            resource_type=dolbilka_resources.DPLANNER_EXECUTABLE,
            state=ctr.State.READY,
            required=True
        )

        # Split on ':' into (owner, name) by get_eventlogs_from_porto; the owner
        # part is also used as the remote user for scp.
        ssh_key = sdk.parameters.String(
            "SSH key vault (in the form of owner:name).",
            default="robot-ynews:ssh_key",
            required=False
        )

    class Context(sdk.Task.Context):
        # No fields are declared here. get_state() assigns `state_resource_id` on
        # self.Context at runtime, and the request-building methods read it back
        # when composing resource descriptions.
        pass

    def make_dir(self, name):
        directory = str(self.path(name))
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)
        return directory

    def get_instances_for_service(self, service_name):
        """Return ``host:port`` strings for all current instances of a Nanny service.

        The Nanny ``current_state/instances`` endpoint is queried up to three
        times, with a 10 second pause between failed attempts.

        :param service_name: Nanny service identifier
        :return: list of ``"hostname:port"`` strings (never empty)
        :raises common.errors.TaskError: if no instances could be obtained
        """
        url = 'http://nanny.yandex-team.ru/v2/services/{}/current_state/instances/'.format(service_name)
        attempts = 3
        instances = []
        for attempt in range(attempts):
            try:
                # closing() releases the HTTP connection even if read()/parsing fails.
                with contextlib.closing(urllib2.urlopen(url)) as result:
                    js = json.loads(result.read())
                instances = [
                    entry['hostname'] + ':' + str(entry['port'])
                    for entry in js['result']
                ]
                break
            except urllib2.URLError as e:
                # URLError is the base class of HTTPError, so connection-level
                # failures (DNS, refused connection) are retried as well.
                logging.warning("Attempt %d to list instances of %s failed: %r", attempt + 1, service_name, e)
                if attempt + 1 < attempts:
                    # No point in sleeping after the final attempt.
                    time.sleep(10)

        if not instances:
            raise common.errors.TaskError("Failed to get hosts for service " + service_name)

        return instances

    def get_instances_for_porto_service(self, service_name):
        """List instances of a service by running ``sky listinstances``.

        The command output is written to ``instances_<service_name>.txt`` in the
        current directory and then parsed line by line. Blank lines are skipped;
        records without a ``:`` or containing a space are logged and ignored.

        :param service_name: service identifier, used to build the ``f@<name>`` selector
        :return: list of unique instance records, in no particular order
        :raises common.errors.TaskError: if the ``sky`` command exits with a non-zero code
        """
        listing_path = "instances_{}.txt".format(service_name)
        sky_cmd = ["sky", "listinstances", "--no_tag", "--format", "ls", 'f@{}'.format(service_name)]

        with helpers.ProcessLog(self, logger="get_instances_for_porto_service.log.log") as pl:
            with open(listing_path, 'w') as listing:
                proc = sp.Popen(sky_cmd, stdout=listing, stderr=pl.stderr)
            exit_code = proc.wait()
            if exit_code != 0:
                raise common.errors.TaskError("Failed to get hosts for service " + service_name + ": `sky list` run failed with code {}".format(proc.returncode))

        unique_instances = set()
        with open(listing_path, 'r') as listing:
            for raw_line in listing:
                record = raw_line.strip()
                if not record:
                    continue
                if ':' in record and ' ' not in record:
                    unique_instances.add(record)
                else:
                    logging.error("Broken instance record: '%s'", record)
        return list(unique_instances)

    def get_eventlogs_from_porto(self, log_suffix, instances, service_name, logs_count=5):
        """Copy current eventlogs from porto-hosted instances with ``scp``.

        Instances are tried in order until ``logs_count`` logs have been
        downloaded; a failed download is logged and the next instance is tried.

        :param log_suffix: eventlog name component; also names the local directory and the process log
        :param instances: iterable of ``"host:port"`` strings
        :param service_name: not used by this method; kept for interface compatibility
        :param logs_count: number of eventlogs that must be fetched
        :return: path of the directory holding the downloaded logs
        :raises common.errors.TaskError: if fewer than ``logs_count`` logs were fetched
        """
        @contextlib.contextmanager
        def no_ssh_key():
            # Stand-in context manager used when no SSH key is configured.
            yield

        if self.Parameters.ssh_key:
            ssh_key_owner, ssh_key_name = self.Parameters.ssh_key.split(':')
        else:
            # NOTE(review): with no key configured the scp "User=" option below is
            # rendered with the literal owner "None" — confirm this is intended.
            ssh_key_owner, ssh_key_name = None, None

        directory = self.make_dir(log_suffix)
        counter = 0
        with helpers.ProcessLog(self, logger="scp_from_porto_"+log_suffix) as pl:
            with (sdk.ssh.Key(self, key_owner=ssh_key_owner, key_name=ssh_key_name) if ssh_key_name else no_ssh_key()):
                for instance in instances:
                    if counter >= logs_count:
                        break
                    logging.debug("Trying to fetch eventlog from '%s'", instance)

                    # Split on the last colon so that the port is always the trailing component.
                    (host, port) = instance.rsplit(":", 1)

                    eventlog_basename = 'current-eventlog-{log_suffix}-{port}'.format(log_suffix=log_suffix, port=port)
                    remote_eventlog_path = '/logs/{eventlog_basename}'.format(eventlog_basename=eventlog_basename)
                    local_eventlog_path = os.path.join(directory, '{}@{}'.format(eventlog_basename, host))

                    cmd = [
                        'scp',
                        '-P', '10046',  # porto ssh
                        '-o', 'StrictHostKeyChecking=no',
                        '-o', 'UserKnownHostsFile=/dev/null',
                        '-o', 'User=//user:{user}//slot:{port}@{host}'.format(user=ssh_key_owner, port=port, host=host),
                        '{host}:{remote_eventlog_path}'.format(host=host, remote_eventlog_path=remote_eventlog_path),
                        local_eventlog_path,
                    ]

                    try:
                        sp.check_call(cmd, stderr=pl.stderr, stdout=pl.stdout)
                    except Exception:
                        # Best effort: one unreachable instance must not fail the task, but
                        # KeyboardInterrupt/SystemExit are no longer swallowed and the
                        # reason for the failure ends up in the log.
                        logging.exception("Failed to download event log from " + host)
                    else:
                        counter = counter + 1
        if counter < logs_count:
            raise common.errors.TaskError("Not enough logs for {} ({}/{})".format(log_suffix, counter, logs_count))
        return directory

    def get_eventlogs(self, log_suffix, instances, logs_count=5):
        """Download current eventlogs from instances with ``rsync``.

        Instances are tried in order until ``logs_count`` logs have been
        downloaded; a failed download is logged and the next instance is tried.
        Each log is stored under the host name inside the target directory.

        :param log_suffix: eventlog name component; also names the local directory and the process log
        :param instances: iterable of ``"host:port"`` strings
        :param logs_count: number of eventlogs that must be fetched
        :return: path of the directory holding the downloaded logs
        :raises common.errors.TaskError: if fewer than ``logs_count`` logs were fetched
        """
        directory = self.make_dir(log_suffix)
        counter = 0
        with helpers.ProcessLog(self, logger="rsync_"+log_suffix) as pl:
            for instance in instances:
                if counter >= logs_count:
                    break

                # Split on the last colon so that the port is always the trailing component.
                (host, port) = instance.rsplit(":", 1)
                cmd = [
                    'rsync',
                    host + "::logs/current-eventlog-" + log_suffix + "-" + port,
                    directory + "/" + host
                ]

                try:
                    # rsync's stderr is deliberately merged into the stdout log.
                    sp.check_call(cmd, stderr=pl.stdout, stdout=pl.stdout)
                except Exception:
                    # Best effort: one unreachable instance must not fail the task, but
                    # KeyboardInterrupt/SystemExit are no longer swallowed and the
                    # reason for the failure ends up in the log.
                    logging.exception("Failed to download event log from " + host)
                else:
                    counter = counter + 1
        if counter < logs_count:
            raise common.errors.TaskError("Not enough logs for {} ({}/{})".format(log_suffix, counter, logs_count))
        return directory

    def find_file(self, name, path):
        for root, dirs, files in os.walk(path):
            if name in files:
                return os.path.join(root, name)
            elif dirs is not None:
                for d in dirs:
                    f = self.find_file(name, os.path.join(root, d))
                    if f is not None:
                        return f

    def bad_query(self, req):
        if self.Parameters.bad_flags or self.Parameters.bad_string_in_queries:
            try:
                req_json = json.loads(req)
                routerd = next((x for x in req_json if x['name'] == 'NEWS_ROUTERD'))['results']
                flags = next((x for x in routerd if x['type'] == 'flags'))['all']
                for key, val in self.Parameters.bad_flags.iteritems():
                    if key in flags and (str(flags[key]) == val or val == "ALL_VALUES"):
                        return True
                full_url = next((x for x in routerd if x['type'] == 'request'))['full_url']
                for bad_substr in self.Parameters.bad_string_in_queries:
                    if bad_substr in full_url:
                        return True
            except Exception as e:
                logging.error(repr(e))
                pass
        return False

    def create_apphost_requests(self):
        """Build AppHost request and plan resources for slave_newsd and routerd.

        Steps:
          1. unpack the AppHost bundle and locate ``event_log_dump`` in it;
          2. fetch AppHost eventlogs (scp for porto instances, rsync otherwise);
          3. dump every log and route each ``TInputDump`` payload to the routerd or
             slave_newsd file, according to the most recent matching ``TStartRequest``;
          4. publish the first 50000 lines of each file as request resources;
          5. build dolbilka plans from the full request files.

        :raises common.errors.TaskError: if ``event_log_dump`` is missing from the
            bundle or no requests were collected for either target
        """
        apphost_bundle_dir = self.make_dir("apphost_bundle")
        apphost_bundle = str(sdk.ResourceData(self.Parameters.apphost_bundle).path)
        # closing() releases the archive handle even if extraction fails.
        with contextlib.closing(tarfile.open(apphost_bundle)) as tar:
            tar.extractall(apphost_bundle_dir)
        evlogdump = self.find_file("event_log_dump", apphost_bundle_dir)
        if evlogdump is None:
            raise common.errors.TaskError("Failed to get AppHost evlogdump")

        if self.Parameters.apphost_in_porto:
            apphost_instances = self.get_instances_for_porto_service(self.Parameters.apphost_instances)
            apphost_dir = self.get_eventlogs_from_porto("app_host", apphost_instances, self.Parameters.apphost_instances, logs_count=5)
        else:
            apphost_instances = self.get_instances_for_service(self.Parameters.apphost_instances)
            apphost_dir = self.get_eventlogs("app_host", apphost_instances, 5)

        newsd_resource = sdk.ResourceData(
            rt.SLAVE_NEWSD_APPHOST_REQUESTS(
                self,
                'slave_newsd apphost queries for state {}'.format(self.Context.state_resource_id),
                "apphost_requests.json",
                ttl=60,
                slave_newsd_testenv_base="yes"
            )
        )
        routerd_resource = sdk.ResourceData(
            rt.ROUTERD_REQUESTS(
                self,
                'routerd apphost queries',
                "routerd_requests.json",
                ttl=60,
                timestamp=int(time.time()),
                routerd_testenv_base="yes"
            )
        )

        newsd_requests_filename = "apphost_requests.500.json"
        routerd_requests_filename = "routerd_requests.500.json"

        count = 0
        max_count = 250000 * 2
        with open(newsd_requests_filename, 'w') as slave, \
             open(routerd_requests_filename, 'w') as routerd:
            for root, dirs, files in os.walk(apphost_dir):
                for log in files:
                    if count > max_count:
                        # Enough requests collected: do not spawn dumpers for the remaining logs.
                        break
                    log_name = os.path.join(root, log)
                    logging.info("{}: {}".format(log_name, os.stat(log_name)))
                    cmd = [
                        evlogdump,
                        log_name
                    ]
                    p = sp.Popen(cmd, bufsize=-1, stdout=sp.PIPE)
                    # Output file selected by the latest matching TStartRequest.
                    # NOTE(review): a TStartRequest for any other path leaves the previous
                    # selection in place — confirm that this is intended.
                    fd = None
                    try:
                        for line in p.stdout:
                            if count > max_count:
                                break
                            f = line.rstrip('\n').split('\t')
                            if len(f) < 3:
                                # Truncated or unexpected line: there is no event type to inspect.
                                continue
                            if f[2] == 'TStartRequest':
                                if len(f) < 5:
                                    continue
                                if f[4] == '/news-all':  # 'HTTP_REQUEST'
                                    fd = routerd
                                elif f[4].startswith('_subhost/news-all-core'):  # 'NEWS_ROUTERD'
                                    fd = slave
                            elif f[2] == 'TInputDump':
                                if fd is None or len(f) < 4:
                                    # A dump seen before any known request start has no destination.
                                    continue
                                if not self.bad_query(f[3]):
                                    fd.write(f[3] + "\n")
                                    count += 1
                    finally:
                        # Always reap the dumper, including when parsing raises.
                        p.stdout.close()
                        p.wait()

        logging.info("newsd_requests_filename stat: {}".format(os.stat(newsd_requests_filename)))
        logging.info("routerd_requests_filename stat: {}".format(os.stat(routerd_requests_filename)))
        if os.stat(newsd_requests_filename).st_size == 0:
            raise common.errors.TaskError("Failed to find requests for newsd")
        if os.stat(routerd_requests_filename).st_size == 0:
            raise common.errors.TaskError("Failed to find requests for routerd")

        def head(input_file, output_file, count):
            # Copy at most `count` leading lines of input_file into output_file.
            index = 0
            with open(input_file) as in_fd, open(output_file, 'w') as out_fd:
                for line in in_fd:
                    if index >= count:
                        break
                    out_fd.write(line)
                    index = index + 1

        head(newsd_requests_filename, str(newsd_resource.path), 50000)
        head(routerd_requests_filename, str(routerd_resource.path), 50000)

        newsd_plan_resource = sdk.ResourceData(
            rt.BASESEARCH_PLAN(
                self,
                'slave_newsd apphost plan for perfomance test for state {}'.format(self.Context.state_resource_id),
                "apphost_slave_newsd_dolbilo.plan",
                ttl=60,
                slave_newsd_testenv_base="yes")
            )
        self.make_perfomance_plan(newsd_requests_filename, str(newsd_plan_resource.path))

        routerd_plan_resource = sdk.ResourceData(
            rt.BASESEARCH_PLAN(
                self,
                'routerd plan for perfomance test',
                "routerd_dolbilo.plan",
                ttl=60,
                routerd_testenv_base="yes")
            )
        self.make_perfomance_plan(routerd_requests_filename, str(routerd_plan_resource.path))

    def make_perfomance_plan(self, requests_file, output_file):
        """Turn a request file into a dolbilka plan.

        Runs ``make_tank_ammo`` on *requests_file*, then ``d-planner`` on
        ``<requests_file>.ammo`` (presumably produced by the first step — TODO
        confirm), writing the plan to *output_file*. Each tool logs to its own
        process log.

        :param requests_file: path of the file with collected requests
        :param output_file: path where the plan is written
        """
        ammo_tool = str(sdk.ResourceData(self.Parameters.make_tank_ammo).path)
        with helpers.ProcessLog(self, logger=requests_file+".make_tank_ammo.log") as ammo_log:
            sp.check_call(
                [ammo_tool, "--addr", "localhost:17171", "-i", requests_file],
                stdout=ammo_log.stdout,
                stderr=sp.STDOUT,
            )

        planner_tool = str(sdk.ResourceData(self.Parameters.dolbilka_planner).path)
        ammo_file = requests_file+".ammo"
        with helpers.ProcessLog(self, logger=requests_file+".d_planner.log") as planner_log:
            sp.check_call(
                [planner_tool, "-l", ammo_file, "-o", output_file, "-t", "phantom"],
                stdout=planner_log.stdout,
                stderr=sp.STDOUT,
            )

    def create_plain_requests(self):
        """Build plain-text slave_newsd queries and a plan from noapache eventlogs.

        For every dumped log line whose 7th field is ``NEWSD_RUSSIAN`` the source
        id (4th field) is remembered; the next line carrying the same id supplies
        the request (8th field). A request is published only if its ``relev``
        CGI parameter contains a ``newsd_query`` that has not been seen yet, after
        the user id and ``lr`` region are stripped from it.

        Output:
          * a ``PLAIN_TEXT_QUERIES`` resource with the requests;
          * ``plain_requests_for_slave.txt`` with ``/yandsearch``-prefixed requests;
          * a ``BASESEARCH_PLAN`` resource built by d-planner from the latter file.
        """
        evlogdump = str(sdk.ResourceData(self.Parameters.noapach_evlogdump).path)

        noapach_instances = self.get_instances_for_service(self.Parameters.noapach_instances)
        noapach_dir = self.get_eventlogs("noapache", noapach_instances, 5)

        resource_data = sdk.ResourceData(
            rt.PLAIN_TEXT_QUERIES(
                self,
                'slave_newsd queries for state {}'.format(self.Context.state_resource_id),
                "noapach_requests.txt",
                ttl=60,
                slave_newsd_testenv_base="yes")
            )
        count = 0
        max_count = 50000
        # Compiled once: these patterns are applied to every candidate request.
        user_re = re.compile(r"user%3D\d+%26")
        region_re = re.compile(r"lr=\d+&")
        host_re = re.compile(r"http2?://[^/]+/yandsearch")
        with open(str(resource_data.path), 'w') as output, \
             open("plain_requests_for_slave.txt", 'wb') as output_with_url:
            seen = set()
            for root, dirs, files in os.walk(noapach_dir):
                for log in files:
                    if count > max_count:
                        # Enough requests collected: do not spawn dumpers for the remaining logs.
                        break
                    cmd = [
                        evlogdump,
                        os.path.join(root, log)
                    ]
                    proc = sp.Popen(cmd, bufsize=-1, stdout=sp.PIPE)
                    newsd_source = None
                    try:
                        for line in proc.stdout:
                            if count > max_count:
                                break
                            f = line.rstrip('\n').split('\t')
                            if newsd_source is not None and len(f) > 3 and newsd_source == f[3]:
                                newsd_source = None
                                if len(f) < 8:
                                    # No request field on this line.
                                    continue
                                request = f[7]
                                relev = None
                                for cgi_param in request.split('&'):
                                    # partition() tolerates values containing '=' and
                                    # parameters without a value.
                                    key, _, value = cgi_param.partition('=')
                                    if key == 'relev':
                                        relev = urllib.unquote(value)
                                if relev is None:
                                    continue
                                publish = False
                                for param in relev.split(';'):
                                    key, _, value = param.partition('=')
                                    if key != "newsd_query":
                                        continue
                                    # De-duplicate on the query with user id and region removed.
                                    clean = user_re.sub("", urllib.unquote(value))
                                    clean = region_re.sub("", clean)
                                    if clean in seen:
                                        continue
                                    seen.add(clean)
                                    publish = True

                                if publish:
                                    request = host_re.sub("", request)
                                    output.write(request + "\n")
                                    output_with_url.write("/yandsearch" + request + "\n")
                                    count = count + 1
                            elif len(f) >= 7 and f[6] == "NEWSD_RUSSIAN":
                                newsd_source = f[3]
                    finally:
                        # Always reap the dumper, including when parsing raises.
                        proc.stdout.close()
                        proc.wait()
        newsd_plain_plan_resource = sdk.ResourceData(
            rt.BASESEARCH_PLAN(
                self,
                'slave_newsd plan for perfomance test for state {}'.format(self.Context.state_resource_id),
                "slave_newsd_dolbilo.plan",
                ttl=60,
                slave_newsd_testenv_baseplan_plain="yes"
            )
        )
        d_planner = str(sdk.ResourceData(self.Parameters.dolbilka_planner).path)
        planner_cmd = [
            d_planner,
            "-l", "plain_requests_for_slave.txt",
            "-o", str(newsd_plain_plan_resource.path),
            "-t", "plain",
            "-h", "localhost",
            "-p", "17171"
        ]
        with helpers.ProcessLog(self, logger="plain_requests_for_slave.d_planner.log") as pl:
            sp.check_call(planner_cmd, stdout=pl.stdout, stderr=sp.STDOUT)

    def get_active_host(self):
        """Pick the News indexer host to fetch the state from.

        Each host from ``Parameters.indexer_hosts`` is asked for
        ``/state/active`` on port 3000; the first one answering ``1`` is
        returned. If none reports itself active, the first configured host is
        used as a fallback.

        :return: host name
        :raises common.errors.TaskError: if the host list is empty
        """
        hosts = [h.strip() for h in self.Parameters.indexer_hosts.split(",")]
        hosts = [h for h in hosts if h]
        if not hosts:
            # str.split always returns at least one element, so an empty setting
            # has to be detected explicitly.
            raise common.errors.TaskError("Failed to find active News indexer host")

        active = hosts[0]  # fallback when no host reports itself active
        for host in hosts:
            url = "http://" + host + ":3000/state/active"
            try:
                # closing() releases the HTTP connection even if read() fails.
                with contextlib.closing(urllib2.urlopen(url)) as response:
                    status = response.read()
                if int(status) == 1:
                    active = host
                    break
            except urllib2.URLError as e:
                # URLError also covers HTTPError; an unreachable host is simply skipped.
                logging.warning("Failed to query state of %s: %r", host, e)
            except ValueError:
                # A non-numeric answer means "not active" rather than a task failure.
                logging.warning("Unexpected state answer from %s: %r", host, status)

        logging.info("Active host: %s", active)
        return active

    def get_state(self):
        """Download the current state dump and publish it as a resource.

        The dump is fetched from ``http://<active host>:1985/dump`` and stored
        in a ``SLAVE_NEWSD_STATE`` resource. The resource id is saved to
        ``self.Context.state_resource_id`` for use in later resource descriptions.

        :raises common.errors.TaskError: if the dump request fails with an HTTP error
        """
        active_host = self.get_active_host()
        url = "http://" + active_host + ":1985/dump"

        resource = rt.SLAVE_NEWSD_STATE(
            self,
            'slave_newsd state for response tests',
            "slave_newsd_state.bin",
            ttl=60,
            slave_newsd_testenv_base="yes",
            timestamp=int(time.time()),
        )
        resource_data = sdk.ResourceData(resource)

        self.Context.state_resource_id = resource.id

        with open(str(resource_data.path), 'wb') as fd:
            try:
                # Stream the dump in chunks instead of holding it in memory as a
                # whole; closing() releases the connection afterwards.
                with contextlib.closing(urllib2.urlopen(url)) as state:
                    shutil.copyfileobj(state, fd, 1024 * 1024)
            except urllib2.HTTPError as e:
                raise common.errors.TaskError(
                    "Failed to fetch state from master_newsd: " +
                    str(e.code) + ", " + str(e.read())
                )

    def on_execute(self):
        self.get_state()

        self.create_apphost_requests()
        self.create_plain_requests()
