import os
import os.path
import json
import requests
import logging
import subprocess
from random import shuffle
from sandbox import sandboxsdk
from sandbox.sandboxsdk.svn import Arcadia
from sandbox.projects import resource_types
from datetime import datetime, timedelta
from sandbox.projects.release_machine import rm_notify
from sandbox.projects.common import utils
from sandbox.projects.geosearch.tools.nanny import Nanny


class Service(sandboxsdk.parameters.SandboxSelectParameter):
    '''Select parameter: the service whose requests are collected.'''
    name = 'service'
    description = 'Collect requests for'
    required = True
    # NOTE(review): pairs are presumably (title, value); the task code compares
    # the stored value against 'Geo' and 'Upper', which only appear as second elements.
    choices = [
        ('Business', 'Business'),
        ('Wiki', 'Wiki'),
        ('Geocoder', 'Geocoder'),
        ('Middle', 'Geo'),
        ('Upper', 'Upper'),
        ('Objects', 'Objects'),
    ]


class Location(sandboxsdk.parameters.SandboxSelectParameter):
    '''Select parameter: the location whose instances are asked for logs.'''
    name = 'location'
    description = 'Collect requests from location'
    required = True
    # on_execute lower-cases the stored value before passing it to Nanny.
    choices = [
        ('SAS_ADDRS_NEW', 'SAS'),
        ('MAN_ADDRS_NEW', 'MAN'),
        ('VLA_ADDRS_NEW', 'VLA'),
    ]


class TimeInterval(sandboxsdk.parameters.SandboxStringParameter):
    '''
    Length of the collection window in seconds.

    Stored as a string; on_execute converts it with int() and get_time_limits
    subtracts that many seconds from the rsync start time.
    '''
    name = 'time_interval'
    description = 'Time interval in seconds'
    default_value = '15'


class FallbackContainerHost(sandboxsdk.parameters.SandboxStringParameter):
    '''
    Host to use when Nanny returns no instances (per the description).

    NOTE(review): the parameter is listed in the task's input_parameters, but
    nothing in the visible part of this file reads it — confirm whether the
    fallback is still supposed to be wired in.
    '''
    name = 'fallback_container_host'
    description = 'container host if no one was found via nanny'
    default_value = 'sas1-1184-sas-addrs-mmeta-new-8051.gencfg-c.yandex.net'


class NumberOfHosts(sandboxsdk.parameters.SandboxStringParameter):
    '''
    Upper bound on the number of hosts that logs are fetched from.

    Stored as a string; on_execute converts it with int() and, unless the
    value is 0, keeps only that many entries of the shuffled host list.
    '''
    name = 'number_of_hosts'
    description = 'Limit number of hosts (0 for all hosts)'
    default_value = '10'


class NumberOfRequests(sandboxsdk.parameters.SandboxIntegerParameter):
    '''
    Optional cap on the number of requests written to the result file.

    When the value is truthy it is passed as --num-queries to
    choose_random_queries.py or as --head-count to shuf; otherwise no cap
    is passed.
    '''
    name = 'number_of_requests'
    description = 'Limit number of requests'
    required = False


class SampleOriginsUniformly(sandboxsdk.parameters.SandboxBoolParameter):
    '''
    Chooses how the collected requests are sampled into the result file.

    When true, on_execute calls sample_origins_uniformly(); otherwise it
    calls randomize().
    '''
    name = 'sample_origins_uniformly'
    description = 'Try to collect the same number of requests for each origin'
    required = False
    default_value = True


# Maps a service name to the awk regex that get_log_numbers matches against
# field 7 of SubSourceInit events. A service without an entry here is matched
# by its own name used as the regex.
SOURCE_REGEX = {
    'Geo': '^Geo$',
    'Objects': '^Objects(Geo)?$'
}


@rm_notify.notify2()
class GetAddrsRequests(sandboxsdk.task.SandboxTask, object):
    '''Get requests from addrs production'''

    # Task type identifier.
    type = 'GET_ADDRS_REQUESTS'

    # 100 << 10 == 102400.
    # NOTE(review): presumably megabytes (about 100 GiB) — confirm against the Sandbox SDK.
    execution_space = 100 << 10

    # Parameter classes exposed by this task. Their values are read back
    # from self.ctx by name in on_execute, sample_origins_uniformly and randomize.
    input_parameters = [Service,
                        Location,
                        FallbackContainerHost,
                        TimeInterval,
                        NumberOfHosts,
                        NumberOfRequests,
                        SampleOriginsUniformly]

    def get_evlogdump(self):
        '''
        Sync the pinned evlogdump resource and return the result of sync_resource.

        The returned value is later interpolated into shell commands as the
        executable to run.
        '''
        evlogdump_resource_id = '755679578'
        return self.sync_resource(evlogdump_resource_id)

    def get_time_limits(self):
        template = '%Y-%m-%dT%H:%M:%S'
        start = self.rsync_time - timedelta(seconds=self.time_interval)
        start = start.strftime(template)
        return start

    def dump_to_txt(self, eventlog_file):
        log_txt = eventlog_file + '.txt'
        self.start = self.get_time_limits()
        cmd = "%s -s %s -i SubSourceRequest %s > %s" % (self.evlogdump,
                                                        self.start,
                                                        eventlog_file,
                                                        log_txt)
        logging.info('Running %s' % cmd)
        try:
            subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
        except subprocess.CalledProcessError as err:
            logging.info(err.output)
        return log_txt

    def get_logfile(self, host):
        '''
        Copy the current eventlog from host into self.logs_dir with rsync over ssh.

        self.rsync_time is set to the moment just before rsync is started.
        Returns the local path of the copied file, or None when rsync exits
        with a non-zero status (the failure is logged with its traceback).
        '''
        try:
            os.mkdir(self.logs_dir)
        except OSError:
            logging.info('%s already exist' % self.logs_dir)
        with sandboxsdk.ssh.Key(self, "GEOMETA-SEARCH", "robot-geosearch-ssh"):
            logging.info('Getting log file from %s' % host)
            # The remote log name depends on which Nanny service is queried.
            if self.nanny_service == 'addrs_middle':
                log_kind = 'middle'
            else:
                log_kind = 'upper'
            remote_name = 'current-eventlog-addrs{service}-80'.format(service=log_kind)
            local_name = '{host}_{f_name}'.format(host=host, f_name=remote_name)
            local_path = os.path.join(self.logs_dir, local_name)
            rsync_tpl = 'rsync -av -e "ssh -l  \'//user:robot-geosearch//configuration_id:{nanny_service}#{conf_id}\'" {host}:/logs/{remote_name} {local_name}'
            cmd = rsync_tpl.format(
                nanny_service=self.nanny_service,
                conf_id=self.conf_id,
                host=host,
                remote_name=remote_name,
                local_name=local_path,
            )
            self.rsync_time = datetime.now()
            logging.info('Started %s at %s' % (cmd, self.rsync_time))
            try:
                rsync_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError:
                logging.exception('Failed to get logfile from {}'.format(host))
                return None
            logging.info('Rsync output: {}'.format(rsync_output))
            return local_path

    def get_log_numbers(self, eventlog_file):
        '''
        Collect the distinct field-4 values of matching SubSourceInit events.

        Runs evlogdump on eventlog_file starting from self.start and pipes it
        through awk, which prints field 4 of every line whose field 7 matches
        the regex for self.service (SOURCE_REGEX entry, or the service name
        itself when there is none).

        Returns a list of unique output lines, or None when the command exits
        with a non-zero status (its output is logged).
        '''
        source_regex = SOURCE_REGEX.get(self.service, self.service)
        cmd_tpl = ("%s -s %s -i SubSourceInit %s |"
                   " awk '$7 ~ /%s/ {print $4}'")
        cmd = cmd_tpl % (self.evlogdump, self.start, eventlog_file, source_regex)
        logging.info('Running %s' % cmd)
        try:
            raw_output = subprocess.check_output(cmd,
                                                 stderr=subprocess.STDOUT,
                                                 shell=True)
        except subprocess.CalledProcessError as failure:
            logging.info(failure.output)
            return None
        return list(set(raw_output.splitlines()))

    def get_lines(self, log_file):
        logging.info('numbers = %s' % self.subsource_numbers)
        if self.subsource_numbers:
            statement = '||'.join('($4=="{}")'.format(number) for number in self.subsource_numbers)
            cmd = "<%s awk '%s {print $8}'" % (log_file, statement)
            logging.info('running %s' % cmd)
            proc_out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
            full_reqs = set(proc_out.splitlines())
            logging.info('Number of requests: %s' % len(full_reqs))
            reqs = []
            its_params = self.get_its_params()
            for req in full_reqs:
                try:
                    if req.endswith('[cut]'):
                        continue
                    uri = req.split('/', 3)[3]
                    if self.service == 'Geo':
                        uri = '%s&%s' % (uri, its_params)
                    reqs.append('/%s\n' % uri)
                except IndexError:
                    continue
            with open(self.temporary_file, 'a') as temp_file:
                temp_file.writelines(reqs)

    def get_upper_requests(self, eventlog_file):
        self.start = self.get_time_limits()
        cmd = "%s -s %s -i ContextCreated %s | cut -f 4" % (
            self.evlogdump,
            self.start,
            eventlog_file)
        logging.info('Running %s' % cmd)
        try:
            proc_out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
        except subprocess.CalledProcessError as err:
            logging.info(err.output)
        reqs = proc_out.splitlines()
        logging.info('Number of requests: %s' % len(reqs))
        its_params = self.get_its_params()
        for i in xrange(len(reqs)):
            reqs[i] = '/yandsearch?%s&middle_%s\n' % (reqs[i], its_params)
        with open(self.temporary_file, 'a') as temp_file:
            temp_file.writelines(reqs)

    def get_requests(self, host):
        eventlog = self.get_logfile(host)
        self.start = self.get_time_limits()
        if self.service == 'Upper':
            self.get_upper_requests(eventlog)
        else:
            self.subsource_numbers = self.get_log_numbers(eventlog)
            subsource_requests = self.dump_to_txt(eventlog)
            logging.info(os.listdir('./production_logs'))
            self.get_lines(subsource_requests)
            os.remove(subsource_requests)
        os.remove(eventlog)

    def sample_origins_uniformly(self):
        '''
        Reads from self.temporary_file and writes to self.result_file.

        Checks out the get_queries_for_sandbox directory from Arcadia and runs
        its choose_random_queries.py on the temporary file, with --num-queries
        when the NumberOfRequests parameter is set. Raises
        subprocess.CalledProcessError when the script exits with a non-zero status.
        '''
        script_dir = Arcadia.get_arcadia_src_dir(
            'arcadia:/arc/trunk/arcadia/search/geo/tools/get_queries_for_sandbox/')
        cmd = [os.path.join(script_dir, 'choose_random_queries.py')]

        limit = self.ctx.get(NumberOfRequests.name)
        if limit:
            cmd += ['--num-queries', str(limit)]
        cmd += [self.temporary_file]

        with open(self.result_file, 'w') as result:
            logging.info('Running %s', cmd)
            subprocess.check_call(cmd, shell=False, stdout=result)

    def randomize(self):
        '''
        Reads from self.temporary_file and writes to self.result_file.

        Runs /usr/bin/shuf on the temporary file, with --head-count when the
        NumberOfRequests parameter is set. Raises
        subprocess.CalledProcessError when shuf exits with a non-zero status.
        '''
        limit = self.ctx.get(NumberOfRequests.name)
        head_count_args = ['--head-count', str(limit)] if limit else []
        cmd = ['/usr/bin/shuf'] + head_count_args + [self.temporary_file]

        logging.info('Running %s', cmd)
        with open(self.result_file, 'w') as result:
            subprocess.check_call(cmd, shell=False, stdout=result)

    def get_its_params(self):
        '''
        Ask ITS for the './degrade' value of the addrsupper test instances in SAS.

        Posts the instance tags as JSON to the ITS process endpoint and returns
        the './degrade' entry of the JSON response, or None when the response
        has no such key.

        Raises requests.exceptions.RequestException (including Timeout) when
        the call fails, and ValueError when the response body is not JSON.
        '''
        url = 'http://its.yandex-team.ru/v1/process/'
        headers = {'Accept': 'application/json',
                   'Content-Type': 'application/json; charset=utf-8'}
        instance_tags = ['a_itype_addrsupper',
                         'a_ctype_test',
                         'a_prj_addrs',
                         'a_geo_sas']
        # requests has no default timeout; without one a stalled connection
        # would block the task indefinitely.
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(instance_tags),
                                 timeout=30)
        return response.json().get('./degrade')

    def on_execute(self):
        '''
        Task entry point: collect requests from production hosts and publish them.

        Steps:
          1. Read the task parameters from self.ctx and pick the Nanny service
             ('addrs_upper_yp' for 'Geo' and 'Upper', 'addrs_middle' otherwise).
          2. Get the instances of that service in the chosen location from
             Nanny, shuffle them and, unless the host limit is 0, keep only
             that many.
          3. Collect requests from every remaining host into self.temporary_file.
          4. Sample the collected requests into self.result_file.
          5. Register the result file as a PLAIN_TEXT_QUERIES resource.
        '''
        vault_data = self.get_vault_data('robot-geosearch', 'ADDRS')
        self.nanny = Nanny(vault_data)
        self.evlogdump = self.get_evlogdump()
        self.logs_dir = './production_logs'

        self.service = self.ctx.get(Service.name)
        if self.service in ('Geo', 'Upper'):
            self.nanny_service = 'addrs_upper_yp'
        else:
            self.nanny_service = 'addrs_middle'
        self.location = self.ctx.get(Location.name).lower()
        # Both values are stored as strings in the task context.
        self.time_interval = int(self.ctx.get(TimeInterval.name))
        self.number_of_hosts = int(self.ctx.get(NumberOfHosts.name))
        self.temporary_file = 'tmp_requests.txt'
        self.result_file = 'requests.txt'

        logging.info('Collecting requests from {} in {}'.format(self.nanny_service, self.location))
        self.hosts = self.nanny.get_isolated_instances(self.nanny_service, self.location)
        shuffle(self.hosts)
        self.conf_id = self.nanny.get_configuration_id(self.nanny_service)
        logging.info('{} configuration ID is: {}'.format(self.nanny_service, self.conf_id))

        # 0 means "no limit": the full shuffled list is used.
        if self.number_of_hosts != 0:
            self.hosts = self.hosts[:self.number_of_hosts]
            logging.info('Hosts: {}'.format(self.hosts))
        for host in self.hosts:
            self.get_requests(host)

        if utils.get_or_default(self.ctx, SampleOriginsUniformly):
            sampler = self.sample_origins_uniformly
        else:
            sampler = self.randomize
        sampler()

        resource_attributes = {
            'service': 'addrs_%s' % self.service.lower(),
            'created': datetime.now().strftime('%Y-%m-%dT%H:%M'),
        }
        self.create_resource(description='Addrs %s requests' % self.service,
                             resource_path=self.result_file,
                             resource_type=resource_types.PLAIN_TEXT_QUERIES,
                             attributes=resource_attributes)
