#  -*- coding: utf-8 -*-

import os
import logging

from sandbox.sandboxsdk.process import run_process
from sandbox.projects.common import apihelpers
from sandbox.sandboxsdk.errors import SandboxTaskFailureError
import sandbox.common.types.misc as ctm
import sandbox.common.types.client as ctc
import sandbox.projects.TestReportUnit as Unit
from sandbox.projects.common.environments import SandboxMapReduceEnvironment
import sandbox.projects.report.common as report_common


class ReportDataRuntimeRT(Unit.TestReportUnit):
    """
       Build the MapReduce sources for the Util::Random::Text module.

       The resulting table is meant to be used by the DATA_RUNTIME_ITEM task
       when it creates data for the Util::Random::Text module.
    """

    type = 'REPORT_DATA_RUNTIME_RT'
    dns = ctm.DnsType.DNS64
    # Host restriction kept to avoid the GLIBC_2.14 linking bug in valgrind-mode executables.
    # NOTE(review): the original note said "limit to precise", yet trusty hosts are allowed as well - confirm.
    client_tags = ctc.Tag.LINUX_PRECISE | ctc.Tag.LINUX_TRUSTY
    environment = (
        SandboxMapReduceEnvironment(version=1894459),
    )

    execution_space = 1000

    input_parameters = []
    # Value passed as `-count` when reading each per-TLD table in on_execute.
    max_source_lines = 200000

    def _run_mr_cmd(self, cmd, timeout=600):
        """
           Run a shell command line from the report checkout directory.

           :param cmd: complete shell command line (may contain pipes)
           :param timeout: value forwarded to run_process as `timeout`
        """
        run_process(
            [cmd],
            work_dir=self.report_path,
            timeout=timeout,
            shell=True,
            check=True,
            log_prefix='mr',
            outputs_to_one_file=True,
        )

    def on_execute(self):
        """
           Fetch apache and report, then rebuild the random_text source table in MapReduce.

           Steps: pick the latest `reqans_log/YYYYMMDD` table, map it with randomtext.pl
           into the `last_req` and `tld` tables, deduplicate `tld`, split `last_req`
           into one table per TLD, collect a deduplicated sample of every per-TLD table
           into `wc/out` and finally move it to `report/data_runtime/util/random_text/out`.

           :raises SandboxTaskFailureError: if no stable apache bundle resource is found
               or no `reqans_log/YYYYMMDD` table is listed.
        """
        self.report_path = 'report'
        self.apache_path = 'apache_bundle'

        attrs = {"released": "stable"}
        resource = apihelpers.get_last_resource_with_attrs(report_common.ApacheBundleParameter.resource_type, attrs, all_attrs=True)
        if not resource:
            raise SandboxTaskFailureError("Can not find apache")

        self.ctx.update({
            Unit.ArcadiaUrl.name: Unit.ArcadiaUrl.default_value,
            Unit.Selector.name: 'svn',
            report_common.ApacheBundleParameter.name: resource.id
        })
        self.get_apache(resource.id, self.apache_path)
        # NOTE(review): `self` is passed explicitly on top of the bound instance - confirm against get_report's signature.
        self.get_report(self, self.report_path)

        os.environ.update({"MR_USER": "tmp", "DEF_MR_SERVER": "sakura.search.yandex.net"})

        # Latest daily table: names are sorted lexicographically and the last one is taken.
        # Raw string keeps `\d` intact without relying on an invalid escape sequence.
        list_cmd = r"mapreduce -list -prefix reqans_log/ | grep -Po '^reqans_log/\d{8}$' | sort | tail -n1"
        src_table = self.run_and_read(list_cmd, self.report_path).strip()
        logging.info(src_table)
        if not src_table:
            # Without a source table every following mapreduce command line would be malformed.
            raise SandboxTaskFailureError("Can not find reqans_log table")

        prefix = 'report/data_runtime/util/random_text'
        prefix_tmp = 'report/data_runtime/util/random_text/wc'
        last_req_table = os.path.join(prefix_tmp, "last_req")
        tld_table = os.path.join(prefix_tmp, "tld")
        out_tmp_table = os.path.join(prefix_tmp, 'out')

        # Map the source log with randomtext.pl into two destination tables: last_req and tld.
        cmd = "mapreduce -src %s -dst %s -dst %s " % (src_table, last_req_table, tld_table)
        cmd += "-map 'perl -MR13n randomtext.pl' -file ./scripts/dev/randomtext.pl -file ./lib/YxWeb/Util/R13n.pm"
        self._run_mr_cmd(cmd, timeout=3600)

        # Deduplicate the tld table by hand (mr_uniq is not used because the table has many chunks):
        # keep the last row seen for every first-column value and write the result back in place.
        cmd = "mapreduce -read %s" % tld_table
        cmd += r" | perl -e 'my %h; while(<>){(my $tld) = split(/\t/, $_); $h{$tld}=$_;} map {print $_;} values %h'"
        cmd += " | mapreduce -write %s" % tld_table
        self._run_mr_cmd(cmd)

        # Collect every TLD: the first tab-separated column of each row.
        cmd = "mapreduce -read %s" % tld_table
        tld_data = [p.split('\t')[0] for p in self.run_and_read(cmd, self.report_path).splitlines()]
        logging.info(tld_data)

        # Split last_req by TLD: the inline perl mapper looks the row key up in a TLD -> index hash and
        # prints that index on its own line before the row; one -dst table is listed per TLD in the same order.
        perl_code = 'my %hash = qw('
        perl_code += ''.join(" %s %s " % (tld, idx) for idx, tld in enumerate(tld_data))
        perl_code += ');'
        cmd = "mapreduce -src %s -map \"perl -e 'use strict; %s " % (last_req_table, perl_code)
        cmd += r"while (my \$line=<>) {my (\$key) = split(/\t/, \$line, 2); if (exists \$hash{\$key}) "
        cmd += r"{ print \$hash{\$key}, chr(0x0A); print \$line;} }'" + '"'
        cmd += ' '.join([" -dst %s " % os.path.join(prefix_tmp, tld) for tld in tld_data])
        logging.info(cmd)
        self._run_mr_cmd(cmd, timeout=3600)

        # Start from a clean temporary output table, then append a sample of every per-TLD table to it.
        self._run_mr_cmd("mapreduce -drop %s" % out_tmp_table)
        for tld in tld_data:
            cmd = "mapreduce -read %s -count %s " % (os.path.join(prefix_tmp, tld), self.max_source_lines)
            # Swap the column pairs, sort, drop rows that repeat when the first two fields are skipped,
            # then swap the columns back to their original order.
            cmd += r"| awk -v FS='\t' -v OFS='\t' '{print $3, $4, $1, $2}' | sort -k3 | uniq -f2 | awk -v FS='\t' -v OFS='\t' '{ print $3, $4, $1, $2 }' "
            cmd += r"| mapreduce -write %s -append" % out_tmp_table
            self._run_mr_cmd(cmd)

        # Publish: move the assembled temporary table to its final location.
        cmd = "mapreduce -move -src %s -dst %s" % (out_tmp_table, os.path.join(prefix, 'out'))
        self._run_mr_cmd(cmd)

    def cleanup(self):
        """Delegate cleanup to the class that follows Unit.TestReportUnit in the MRO."""
        # NOTE(review): super() is anchored at Unit.TestReportUnit, so TestReportUnit.cleanup itself
        # is skipped - presumably on purpose; confirm before changing.
        super(Unit.TestReportUnit, self).cleanup()


# Module-level alias for the task class defined above.
# NOTE(review): presumably the name the Sandbox loader looks up to find the task - confirm.
__Task__ = ReportDataRuntimeRT
