#!/usr/bin/python
# -*- coding: utf-8 -*-

import logging
import datetime
from collections import defaultdict

from sandbox import sdk2
from sandbox.common.errors import TaskFailure
from sandbox.sdk2.service_resources import SandboxTasksBinary
from sandbox.projects.websearch.begemot.tasks.BegemotYT.common import CommonYtParameters


class BegemotCreateResponsesDiffYTResource(sdk2.Resource):
    """
    Output resource of BegemotCreateResponsesDiffYT.

    The task publishes a local directory under this type: either the
    directory holding the rendered `diff.html` report or the directory
    holding the `stats_diff` JSON file.
    """


class BEGEMOT_DIFF_HTML_TEMPLATE(sdk2.Resource):
    """
    HTML template for the diff report built by BegemotCreateResponsesDiffYT.

    The task reads the template as UTF-8 text and replaces the `$CONTENT`
    placeholder with the concatenated per-row HTML diffs.
    """


class BegemotCreateResponsesDiffYT(sdk2.Task):
    """
        Creates YT table with diffs from two begemot responses

        Execution is driven by two asynchronous YT map-reduce operations
        whose ids are kept in `Context.operations`:

        1. `diff.full_diff` over both input tables, reduced by `reqid`,
           writing the `full_diff` and `diff_by_path` tables;
        2. `diff.group` over `diff_by_path`, reduced by `path`, writing the
           `grouped_diff` table.

        While an operation is still running the task raises `sdk2.WaitTime`
        and relies on `on_execute` being entered again; the number of stored
        operation ids tells which step to start next.  Once both operations
        are finished the task fills the output parameters and, when an HTML
        template is supplied, publishes a `diff.html` report.
    """

    description = 'Creates YT table with diffs from two begemot responses'
    _logger = logging.getLogger('TASK_LOGGER')
    _logger.setLevel(logging.DEBUG)

    class Parameters(sdk2.Task.Parameters):
        description = 'Creates diff from two begemot responces'
        input_path1 = sdk2.parameters.String(
            'Path to the responce of a First Begemot (JSON)',
            required=True,
        )
        input_path2 = sdk2.parameters.String(
            'Path to the responce of a Second Begemot (JSON)',
            required=True,
        )
        # Optional table with per-job results; when non-empty, CACHEHIT medians are published.
        stat_path = sdk2.parameters.String(
            'Path to the stat file',
            default="",
        )
        # YT directory under which full_diff, diff_by_path and grouped_diff are created.
        output_path = sdk2.parameters.String('Output directory', required=True)

        # Limit (in units of 1e6 bytes) on the accumulated HTML put into diff.html.
        max_size = sdk2.parameters.Integer(
            'Max size of output table from which diff html created(MB)',
            default=5,
        )
        yt_proxy = sdk2.parameters.String('YT_PROXY', default='hahn')
        yt_pool = sdk2.parameters.String('YT_POOL', default='robot-dev')
        yt_token_vault_owner = CommonYtParameters.yt_token_vault_owner()
        yt_token_vault_name = CommonYtParameters.yt_token_vault_name()
        # Without a template only the output parameters and the info line are produced.
        html_template = sdk2.parameters.Resource(
            'html_template',
            resource_type=BEGEMOT_DIFF_HTML_TEMPLATE
        )
        # total_seconds() rather than .seconds: the latter silently wraps at one day.
        kill_timeout = int(datetime.timedelta(hours=15).total_seconds())
        wait_time = sdk2.parameters.Integer('Seconds between operations state check', default=30)
        fail_on_any_error = True
        with sdk2.parameters.Output:
            diff_table_path = sdk2.parameters.String('Path to the diff table')
            has_diff = sdk2.parameters.Bool('Whether responces have diff')

    class Requirements(sdk2.Task.Requirements):
        execution_space = 2 * 1024
        disk_space = 2 * 1024

    class Context(sdk2.Task.Context):
        # Ids of the YT operations started so far, in start order.
        operations = []
        # operation id -> {'state', 'url', <progress counters>}; rendered by header().
        operations_info = {}
        errors_count = 0
        wait_mode = False

    def on_create(self):
        """Pin the task to the newest SandboxTasksBinary built for this task type, if one exists."""
        attrs = {'task_type': 'BegemotCreateResponsesDiffYT'}
        archive_resource = SandboxTasksBinary.find(attrs=attrs).order(-sdk2.Resource.id).first()
        if archive_resource is not None:
            self.Requirements.tasks_resource = archive_resource.id

    def check_operations_state(self, yt_client):
        """
        Refresh `Context.operations_info` and stop at the first unfinished operation.

        :param yt_client: YT client used to query the operations.
        :raises sdk2.WaitTime: an operation has not finished yet.
        :raises TaskFailure: an operation finished unsuccessfully.
        """
        import yt.wrapper as yt
        for operation_id in self.Context.operations:
            operation = yt.Operation(operation_id, client=yt_client)
            # Query the state once so that the stored info and both checks
            # below are based on the same snapshot.
            state = operation.get_state()
            info = {'state': state.name, 'url': operation.url}
            info.update(operation.get_progress())
            self.Context.operations_info[operation_id] = info
            if not state.is_finished():
                self._logger.info(
                    'Operation %s is %s, next check in %s seconds',
                    operation_id, state.name, self.Parameters.wait_time,
                )
                raise sdk2.WaitTime(self.Parameters.wait_time)
            if state.is_unsuccessfully_finished():
                raise TaskFailure('Operation is unsuccessfully finished')

    def on_execute(self):
        """
        Start the next pending YT operation or, when both are done, publish results.

        Sets the `diff_table_path` and `has_diff` output parameters, reports
        the share of differing responses via `set_info` and, if
        `html_template` is given, publishes the HTML report.

        :raises sdk2.WaitTime: a YT operation is still running.
        :raises TaskFailure: a YT operation finished unsuccessfully.
        """
        import os
        import diff
        import yt.wrapper as yt

        token = sdk2.Vault.data(self.Parameters.yt_token_vault_owner, name=self.Parameters.yt_token_vault_name)
        yt_client = yt.client.YtClient(self.Parameters.yt_proxy, token)
        # The filter rejects every module whose name contains 'hashlib'.
        yt_client.config['pickling']['module_filter'] = lambda module: 'hashlib' not in getattr(module, '__name__', '')
        yt_client.config['spec_defaults']['pool'] = self.Parameters.yt_pool

        # NOTE(review): this branch is not guarded by Context state, so it
        # presumably runs again on every re-entry after WaitTime - confirm
        # that re-publishing the stats resource is intended.
        if self.Parameters.stat_path:
            self._logger.info('Diff-ing stats')
            stats_dir = 'stats_diff'
            os.mkdir(stats_dir)
            self.get_and_publish_stats_diff(yt_client, stats_dir)
        self._logger.info('Starting diff')

        # Tables created by this task get an expiration time 7 days ahead.
        attrs = {'expiration_time': (datetime.datetime.now() + datetime.timedelta(days=7)).isoformat()}
        full_diff = yt.ypath_join(self.Parameters.output_path, 'full_diff')
        diff_by_path = yt.ypath_join(self.Parameters.output_path, 'diff_by_path')
        grouped_diff = yt.ypath_join(self.Parameters.output_path, 'grouped_diff')

        if not self.Context.operations:
            yt.create('table', full_diff, attributes=attrs, recursive=True, client=yt_client)
            yt.create('table', diff_by_path, attributes=attrs, recursive=True, client=yt_client)
            operation = yt.run_map_reduce(
                None, diff.full_diff,
                source_table=[self.Parameters.input_path1, self.Parameters.input_path2],
                destination_table=[full_diff, diff_by_path],
                reduce_by='reqid', spec={'max_failed_job_count': 0}, client=yt_client, sync=False,
            )
            self.Context.operations.append(operation.id)

        # Covers both the operation started just above and a re-entry after WaitTime.
        self.check_operations_state(yt_client)
        if len(self.Context.operations) == 1:
            yt.create('table', grouped_diff, attributes=attrs, recursive=True, client=yt_client)
            self.Parameters.diff_table_path = grouped_diff
            operation = yt.run_map_reduce(
                None, diff.group, diff_by_path, grouped_diff, reduce_by='path', client=yt_client, sync=False
            )
            self.Context.operations.append(operation.id)
            self.check_operations_state(yt_client)

        diff_rows = yt_client.row_count(full_diff)
        has_diff = diff_rows > 0
        self.Parameters.has_diff = has_diff
        size = yt_client.get_attribute(full_diff, 'uncompressed_data_size')
        self._logger.debug('table size: %d', size)

        if not has_diff:
            return

        total_rows = yt_client.row_count(self.Parameters.input_path1)
        if total_rows:
            # Force float division: under Python 2 int / int truncates, which
            # made the reported share 0.00% for any partial diff.
            responses_change = float(diff_rows) / total_rows
            self.set_info('Diff: %.2f%%' % (responses_change * 100))
        else:
            self.set_info('Diff: %d rows (first input table is empty)' % diff_rows)

        if self.Parameters.html_template:
            self._publish_html_diff(yt_client, full_diff)

    def _publish_html_diff(self, yt_client, full_diff):
        """
        Render rows of `full_diff` into the HTML template and publish the result.

        Each row contributes a `Count x N` heading and its `htmlDiff` body
        (with the `<html>` / `</html>` tags removed).  If the accumulated
        size exceeds `max_size * 1e6`, the whole content is replaced with
        a 'Too big size!' marker.

        :param yt_client: YT client used to read the table.
        :param full_diff: YT path of the table with `htmlDiff` and `count` columns.
        """
        import os
        import sys

        html_template_path = str(sdk2.ResourceData(self.Parameters.html_template).path)
        with open(html_template_path, 'rb') as f:
            template = f.read().decode('utf-8')

        out_dir = 'test_resource'
        os.mkdir(out_dir)
        size_limit = self.Parameters.max_size * int(1e6)
        output_html = []
        diff_size = 0
        for row in yt_client.read_table(full_diff):
            cur_html_diff = row['htmlDiff'].decode('utf-8')
            cur_html_diff = cur_html_diff.replace(u'<html>', u'').replace(u'</html>', u'')
            counter = u'<h1>Count x {} </h1>'.format(row['count'])
            # sys.getsizeof is the in-memory object size, used here as an
            # estimate of how much HTML has been accumulated.
            diff_size += sys.getsizeof(cur_html_diff) + sys.getsizeof(counter)
            if diff_size > size_limit:
                output_html = [u'Too big size!']
                break
            output_html.append(counter)
            output_html.append(cur_html_diff)

        with open(os.path.join(out_dir, 'diff.html'), 'wb') as f:
            htmldiff = template.replace(u'$CONTENT', u"\n".join(output_html))
            f.write(htmldiff.encode('utf-8'))
        resource = BegemotCreateResponsesDiffYTResource(self, 'Test Resource', out_dir)
        sdk2.ResourceData(resource).ready()

    def get_table_url(self, table_path):
        """Return the YT web UI navigation URL for `table_path` on the configured proxy."""
        return 'https://yt.yandex-team.ru/%s/#page=navigation&path=%s' % (self.Parameters.yt_proxy, table_path)

    def get_and_publish_stats_diff(self, yt_client, out_dir):
        """
        Publish per-key medians of CACHEHIT values read from `stat_path`.

        `stat_path` should be non-empty.  Every row's `result` column is
        parsed as JSON; values whose key contains 'CACHEHIT' are collected
        per key and the element at index `n // 2` of the sorted values is
        written as JSON to `<out_dir>/stats_diff`.

        :param yt_client: YT client used to read the stats table.
        :param out_dir: existing local directory that becomes the resource.
        """
        import json

        cachehit_results = defaultdict(list)
        for job_info in yt_client.read_table(self.Parameters.stat_path):
            info = json.loads(job_info['result'])
            # A JSON object decodes to a dict, and iterating a dict yields
            # keys only; go through items() so (key, value) unpacking works.
            # A list of [key, value] pairs is still accepted as before.
            pairs = info.items() if isinstance(info, dict) else info
            for key, data in pairs:
                if 'CACHEHIT' in key:
                    cachehit_results[key].append(data)

        cachehit_medians = dict()
        for key, values in cachehit_results.items():
            ordered = sorted(values)
            cachehit_medians[key] = ordered[len(ordered) // 2]

        with open(out_dir + '/stats_diff', 'wb') as f:
            statsdiff = json.dumps(cachehit_medians, encoding='utf-8')
            f.write(statsdiff)
        resource_data = sdk2.ResourceData(BegemotCreateResponsesDiffYTResource(self, 'Stats diff', out_dir))
        resource_data.ready()

    @sdk2.header()
    def header(self):
        """Build the task page header: a link to the diff table plus the state of each operation."""
        header = ''
        if self.Parameters.diff_table_path:
            header += '<a href="%s">Diff Table in YT</a><br/>' % self.get_table_url(self.Parameters.diff_table_path)
        for operation_id in self.Context.operations:
            if operation_id in self.Context.operations_info:
                # defaultdict supplies '0' for progress counters missing from the stored info.
                header += '''
                        <a href="%(url)s">%(state)s</a><br/>
                        Pending: %(pending)s Running: %(running)s Completed: %(completed)s
                        Failed: %(failed)s Total: %(total)s<br/>
                    ''' % defaultdict(lambda: '0', self.Context.operations_info[operation_id])
        return header
