import traceback
import logging
from sandbox.sandboxsdk.paths import get_unique_file_name
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.parameters import (
    SandboxStringParameter,
    SandboxIntegerParameter,
)
from sandbox.projects.common.binaries_provider_ymake import BinariesProvider, RequiredBinary as reqbin
from sandbox.sandboxsdk.process import run_process
import json


class MrServerParameter(SandboxStringParameter):
    """
        Mapreduce cluster the task talks to (mandatory string parameter)
    """
    name = 'mr_server'
    required = True
    default_value = 'sakura'
    description = 'Mapreduce server ("local" in a locally set up MR microcluster):'


class MrTableParameter(SandboxStringParameter):
    """
        Path of the Mapreduce table whose contents are exported
    """
    name = 'mr_table'
    default_value = 'ppb/musicians/top_musician'
    description = 'Mr table for export'


class LinkCountParameter(SandboxIntegerParameter):
    """
        Optional lower bound on the total number of links to a musician;
        musicians below it are meant to be left out of the output
    """
    name = 'link_count'
    required = False
    description = 'Min number of links'


class ColumnsListParameter(SandboxStringParameter):
    """
        Comma separated names of the social networks that become
        per-network columns of the output
    """
    name = 'network_list'
    default_value = 'vk,twitter,lj,diary,li,other'
    description = 'Network list'


# The only external binary the task needs is the "mapreduce" client.
binaries = [reqbin("quality/mapreduce", "mapreduce")]

# Shared provider used by the task below to build/fetch the binaries above.
binaries_provider = BinariesProvider(
    binaries,
    'Building binaries to compare MR tables',
)


class TopMusicians(SandboxTask):
    """
        Convert the top_musicians MR table to a sandbox resource.

        The table is dumped with the ``mapreduce`` binary, every row is parsed
        (name, entity key, JSON statistics), rows below the ``link_count``
        threshold are dropped, the rest are sorted by ``total`` (descending)
        and written as a tab separated file that is published as a resource.
    """
    type = "TOP_MUSICIANS_FROM_PPB_POSTLOG"
    input_parameters = [
        MrServerParameter,
        MrTableParameter,
        LinkCountParameter,
        ColumnsListParameter
    ] + binaries_provider.task_parameters()

    def on_enqueue(self):
        """Run the base enqueue hook, then start the binaries build task."""
        SandboxTask.on_enqueue(self)
        binaries_provider.run_build_task(self)

    def on_execute(self):
        """
            Dump the MR table, filter/sort the musicians and publish the
            resulting tab separated file as a resource.

            Rows that cannot be parsed and musicians that cannot be
            serialized are logged and skipped instead of failing the task.
        """
        networks = self.ctx.get("network_list").split(",")
        # link_count is optional; None means "do not filter by link count".
        min_links = self.ctx.get("link_count")

        binaries_provider.join_build_task(self)
        binaries_provider.fetch_prebuild_binaries(self)

        fname = get_unique_file_name(".", "top_musicians")
        # NOTE(review): the shell is needed for the ">" redirect, but the
        # parameter values are interpolated into the command line unquoted.
        # Quote them if these parameters can ever come from an untrusted source.
        cmd = 'bin/mapreduce -server %s -subkey -read %s > %s' \
            % (self.ctx.get("mr_server"), self.ctx.get("mr_table"), fname)
        run_process(cmd, log_prefix='mapreduce-read', shell=True, wait=True, check=True)

        musicians = []
        # The dump is read completely and closed before the same path is
        # reopened for writing below.
        with open(fname, 'r') as dump:
            for raw in dump:
                record = raw.rstrip("\n")
                try:
                    name, entity_key, stat_json = record.split("\t")
                    musician = json.loads(stat_json)
                    if min_links is not None and musician["total"] < min_links:
                        # Too few links: leave this musician out of the result.
                        continue
                    musician["name"] = name or "UNKNOWN"
                    musician["entity_key"] = entity_key
                except Exception:
                    # Best effort: a malformed row must not fail the export.
                    logging.warning("failed to parse raw %s", record)
                    continue
                musicians.append(musician)

        musicians.sort(key=lambda item: item["total"], reverse=True)

        columns = ["name", "entity_search_key", "category", "total"]
        with open(fname, 'w') as out:
            out.write("\t".join(columns + networks) + "\n")
            for musician in musicians:
                try:
                    row = [
                        musician.get("name"),
                        musician.get("entity_key"),
                        ",".join(musician.get("category")),
                        musician.get("total"),
                    ]
                    # Networks with no (or zero) links are reported as "0".
                    row.extend(musician.get(network) or "0" for network in networks)
                    out.write("\t".join([str(x) for x in row]) + "\n")
                except Exception:
                    logging.warning("bad object %s", musician)
                    logging.warning(traceback.format_exc())

        # Description and resource type strings are kept exactly as they were;
        # presumably the resource type is registered elsewhere under this
        # spelling -- confirm before renaming.
        self.create_resource(
            "top muscians list",
            fname,
            "TOP_MUSCIANS_LIST",
            arch="any",
        )


# Module-level alias for the task class; presumably the name the Sandbox
# task loader looks up to find this module's task -- TODO confirm.
__Task__ = TopMusicians
