# -*- coding: utf-8 -*-

import os

from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.parameters import ResourceSelector, SandboxStringParameter
from sandbox.projects import resource_types


class MatrixnetFirstTestParameter(ResourceSelector):
    """Mandatory selector for the first MATRIXNET_ALLTESTS_RESULTS resource."""

    # The selected value is stored in the task context under this key.
    name = 'matrixnet_first_test_resource_id'
    resource_type = 'MATRIXNET_ALLTESTS_RESULTS'
    required = True
    description = 'matrixnet first test results'


class MatrixnetSecondTestParameter(ResourceSelector):
    """Optional selector for the second MATRIXNET_ALLTESTS_RESULTS resource."""

    # The selected value is stored in the task context under this key.
    name = 'matrixnet_second_test_resource_id'
    resource_type = 'MATRIXNET_ALLTESTS_RESULTS'
    required = False
    description = 'matrixnet second test results'


class MatrixnetTestInternalFilesPairs(SandboxStringParameter):
    """Multiline string listing file pairs to compare inside the first archive.

    One pair per line, the two file names separated by ``;``.
    """

    name = 'matrixnet_test_files_pairs'
    multiline = True
    default_value = ''
    description = (
        'string containing list of pairs of files to test inside first archive'
        ' in format filename1;filename2\nfilename3;filename4'
    )


class CheckMatrixnetPredictionsDifference(SandboxTask):
    """
    **Description**
    Compares matrixnet models between two revisions.
    Predictions on several test pools are compared, which is equivalent to comparing the models.
    The comparison is based on the mean difference between predictions, which is checked
    against the sum of the mean standard deviations reported in the two files.
    The comparison results are written to the task context.

    **Required resources**

    * **matrixnet first test results** - archive with prediction files for various tests at revision N. Resource of type MATRIXNET_ALLTESTS_RESULTS
    * **matrixnet second test results** - archive with prediction files for various tests at revision M. Resource of type MATRIXNET_ALLTESTS_RESULTS
    """
    type = 'CHECK_MATRIXNET_PREDICTIONS_DIFF'

    input_parameters = (
        MatrixnetFirstTestParameter,
        MatrixnetSecondTestParameter,
        MatrixnetTestInternalFilesPairs,
    )

    def check_two_files(self, file1, file2):
        """Compare two prediction files and record the verdict in the context.

        Both files are read line by line as tab-separated rows; the
        second-to-last column is treated as the prediction and the last column
        as its standard deviation. If the files have different numbers of
        lines, only the common prefix is compared.

        The pair is reported as different when the absolute mean difference of
        the predictions is not smaller than the sum of the mean standard
        deviations of the two files, or when no rows could be compared at all.

        The result is stored in ``self.ctx['file_diffs']`` under the key
        ``file1 + '\\t' + file2`` as ``(has_diff, debug_string)``;
        ``self.ctx['mx_diff']`` is set to True when a difference is found.

        :param file1: path to the first prediction file
        :param file2: path to the second prediction file
        :raises ValueError: if a compared column is not a number
        :raises IndexError: if a row has fewer than two columns
        """
        import numpy

        diffs = []
        stds1 = []
        stds2 = []
        file_key = file1 + '\t' + file2
        with open(file1, 'r') as f1, open(file2, 'r') as f2:
            for line1 in f1:
                line2 = f2.readline()
                if not line2:
                    # file2 ran out of rows: compare only the common prefix.
                    break
                fields1 = line1.strip().split('\t')
                fields2 = line2.strip().split('\t')
                diffs.append(float(fields1[-2]) - float(fields2[-2]))
                stds1.append(float(fields1[-1]))
                stds2.append(float(fields2[-1]))

        if not diffs:
            # Nothing to compare. Statistics of an empty sample are NaN, which
            # previously marked the pair as different by accident; keep that
            # verdict but make the reason explicit and avoid numpy warnings.
            self.ctx['mx_diff'] = True
            self.ctx['file_diffs'][file_key] = (True, "(no comparable prediction rows found)")
            return

        mean_diff = numpy.mean(diffs)
        # The sample standard deviation (ddof=1) is undefined for one row.
        std_diff = numpy.std(diffs, ddof=1) if len(diffs) > 1 else float('nan')
        stds1_mean = numpy.mean(stds1)
        stds2_mean = numpy.mean(stds2)
        dbg_str = (
            "(mean differenece between predictions: {}"
            " std: {}"
            " mean std for file1: {}"
            " mean std for file2: {})"
        ).format(mean_diff, std_diff, stds1_mean, stds2_mean)

        # Written as a negation so that NaN statistics still count as a
        # difference, exactly as in the original if/else.
        has_diff = not (abs(mean_diff) < (stds1_mean + stds2_mean))
        if has_diff:
            self.ctx['mx_diff'] = True
        self.ctx['file_diffs'][file_key] = (has_diff, dbg_str)

    def check_files_in_dirs(self, dir1, dir2):
        """Compare every regular file of ``dir1`` with its namesake in ``dir2``.

        Entries that are not regular files in both directories are skipped.

        :param dir1: directory with the first set of prediction files
        :param dir2: directory with the second set of prediction files
        """
        for file_name in os.listdir(dir1):
            path1 = os.path.join(dir1, file_name)
            path2 = os.path.join(dir2, file_name)
            if os.path.isfile(path1) and os.path.isfile(path2):
                self.check_two_files(path1, path2)

    def print_tests_diff(self):
        """Write the collected comparison results to the file ``tests.diff``.

        Rows are written in sorted key order so the report is deterministic.
        """
        with open('tests.diff', 'w') as diff:
            # NOTE: each row also carries a trailing column with the debug
            # statistics, which this header does not name.
            diff.write('(file1)\t(file2)\t(has_diff)\n')
            for files, res in sorted(self.ctx['file_diffs'].items()):
                diff.write('{}\t{}\t{}\n'.format(files, res[0], res[1]))

    @staticmethod
    def _parse_file_pairs(pairs_text):
        """Parse the multiline ``name1;name2`` parameter into a list of pairs.

        Blank lines are skipped and surrounding whitespace is stripped from
        every name. Fields after the second one are ignored.

        :param pairs_text: raw parameter value, one pair per line
        :return: list of ``(name1, name2)`` tuples
        :raises ValueError: if a non-blank line does not contain two names
        """
        pairs = []
        for raw_line in pairs_text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            names = [name.strip() for name in line.split(';')]
            if len(names) < 2 or not names[0] or not names[1]:
                raise ValueError(
                    "malformed files pair {!r}: expected 'filename1;filename2'".format(raw_line)
                )
            pairs.append((names[0], names[1]))
        return pairs

    def on_execute(self):
        """Unpack the archives, run the comparisons and publish the report.

        The first archive is always extracted. When a second archive is given,
        files with equal names in both archives are compared. When file pairs
        are given, the listed files of the first archive are compared with
        each other. The report is published as a MATRIXNET_ALLTESTS_DIFF
        resource and its id is stored in the context.
        """
        self.ctx['file_diffs'] = {}
        self.ctx['mx_diff'] = False
        os.mkdir('alltests_first')
        os.mkdir('alltests_second')
        run_process('tar -C alltests_first -zxf %s --strip-components=1' % self.sync_resource(self.ctx['matrixnet_first_test_resource_id']), log_prefix='extract_first')

        if self.ctx.get('matrixnet_second_test_resource_id'):
            run_process('tar -C alltests_second -zxf %s --strip-components=1' % self.sync_resource(self.ctx['matrixnet_second_test_resource_id']), log_prefix='extract_second')
            self.check_files_in_dirs('alltests_first', 'alltests_second')

        pairs_text = self.ctx.get('matrixnet_test_files_pairs')
        if pairs_text:
            # Both files of a pair live inside the first archive.
            for name1, name2 in self._parse_file_pairs(pairs_text):
                self.check_two_files(os.path.join('alltests_first', name1), os.path.join('alltests_first', name2))

        self.print_tests_diff()
        tests_diff_resource = self.create_resource('tests.diff', 'tests.diff', resource_types.MATRIXNET_ALLTESTS_DIFF)
        self.ctx['matrixnet_tests_diff_resource_id'] = tests_diff_resource.id


# Module-level alias for the task class defined in this file.
# NOTE(review): presumably the name the Sandbox loader looks up to find the task - confirm.
__Task__ = CheckMatrixnetPredictionsDifference
