import logging
import os
import urllib2
import re
from sandbox import sdk2
from sandbox.sandboxsdk import environments
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.paths import make_folder
from sandbox.sandboxsdk.parameters import SandboxStringParameter
from sandbox.common.utils import gzip_file
from sandbox.projects.common.utils import get_or_default
from sandbox.projects.geosuggest.component import GeoSuggestDaemonWrapper, GeoSuggestDaemonParameter, GeoSuggestDataParameter
from sandbox.projects.geosuggest.resources import MAPS_GEO_SUGGEST_RANKING_POOL
from sandbox.projects.geosuggest.common.qp import GeoSuggestQPTask
from sandbox.projects.geosuggest.common.utils import get_or_default_with_ctx_update, get_yt_table_info
from sandbox.projects.geosuggest.common.parameters import GeoSuggestSandboxBinParameter, GeoSuggestTrainingPoolTableParameter, SandboxVaultOwner, SandboxVaultYtTokenName


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def cleanup_spaces(s):
    """Return *s* with every run of whitespace replaced by one space.

    Tabs and line breaks count as whitespace, so the result never contains
    them.  Leading and trailing whitespace is collapsed to a single space,
    not removed.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(' ', s)
    return collapsed


class PoolTypes(object):
    """Registry of the supported ranking pool types.

    ``TYPES`` maps a pool type name to its settings.  The only setting read
    by this class is ``additional_arguments``: a list of extra command-line
    arguments associated with that pool type.
    """

    TYPES = {
        "basic": {},
        "myak": {"additional_arguments": ["-p", "mobile=1"]},
    }

    @classmethod
    def get_default_type(cls):
        """Return the name of the pool type used by default."""
        return "myak"

    @classmethod
    def get_types(cls):
        """Return the names of all known pool types in sorted order.

        Sorting keeps the order stable: plain dict iteration order is
        arbitrary on Python 2.
        """
        return sorted(cls.TYPES)

    @classmethod
    def get_additional_arguments(cls, pool_type):
        """Return the extra command-line arguments for *pool_type*.

        A new list is returned on every call, so callers may modify the
        result without affecting ``TYPES``.

        Raises KeyError if *pool_type* is not a key of ``TYPES``.
        """
        settings = cls.TYPES[pool_type]
        return list(settings.get("additional_arguments", []))

    @classmethod
    def get_description(cls, pool_type):
        """Return a human-readable label for *pool_type*.

        The label is the type name, followed by its additional arguments in
        parentheses when there are any, e.g. ``"myak (-p mobile=1)"``.

        Raises KeyError if *pool_type* is not a key of ``TYPES``.
        """
        args = cls.get_additional_arguments(pool_type)
        if not args:
            return pool_type
        return pool_type + " (" + " ".join(args) + ")"


class GeoSuggestRankingPoolType(SandboxStringParameter):
    """String task parameter that selects the ranking pool type.

    The selectable values are the pool type names known to ``PoolTypes``;
    each one is shown with the label built by ``PoolTypes.get_description``.
    """

    name = "geosuggest_ranking_pool_type"
    description = "Ranking pool type"
    # (label, value) pairs.  A generator expression is used instead of a list
    # comprehension because on Python 2 a list comprehension in a class body
    # leaks its loop variable into the class as a stray attribute.
    choices = list(
        (PoolTypes.get_description(pool_type), pool_type)
        for pool_type in PoolTypes.get_types()
    )
    default_value = PoolTypes.get_default_type()


class GeoSuggestPrepareRankingPool(GeoSuggestQPTask):
    """Sandbox task that prepares a GeoSuggest ranking pool for FML.

    Steps performed by ``on_execute``:

    1. dump the training table from YT into a TSV file (``read_clicks``);
    2. start a GeoSuggest daemon on localhost, save its factor names and run
       the ``prepare_ranking_pool`` binary against it (``collect_pool``);
    3. concatenate the files it produced (``concatenate_files``);
    4. rearrange the columns into the FML pool layout
       (``convert_to_fml_pool``);
    5. gzip the pool and publish the result directory as a
       ``MAPS_GEO_SUGGEST_RANKING_POOL`` resource.
    """

    type = "GEO_SUGGEST_PREPARE_RANKING_POOL"

    input_parameters = [
        GeoSuggestTrainingPoolTableParameter,
        GeoSuggestRankingPoolType,
        GeoSuggestDaemonParameter,
        GeoSuggestDataParameter,
        GeoSuggestSandboxBinParameter,
        SandboxVaultOwner,
        SandboxVaultYtTokenName,
    ]

    environment = [
        environments.PipEnvironment("yandex-yt"),
    ]

    TIMEOUT = 5 * 3600  # presumably seconds (5 hours) -- confirm against the base task class
    execution_space = 100 * 1024  # 100 Gb
    required_ram = 80 * 1024  # 80 Gb

    def get_yt_token(self):
        """Return the YT token read from the Sandbox vault.

        The vault owner and item name are taken from the task parameters
        ``SandboxVaultOwner`` and ``SandboxVaultYtTokenName``.
        """
        vault_owner = get_or_default(self.ctx, SandboxVaultOwner)
        vault_name = get_or_default(self.ctx, SandboxVaultYtTokenName)
        return self.get_vault_data(vault_owner, vault_name)

    def read_clicks(self, output_filepath):
        """Dump the training table from YT into a tab-separated file.

        The table path and the ``;``-separated column list come from
        ``get_yt_table_info``; both are also stored in ``self.ctx`` under the
        keys ``training_table_path`` and ``training_table_columns``.  Every
        row becomes one output line holding the configured columns in order.

        :param output_filepath: path of the TSV file to create.
        """
        # Imported here rather than at module level; presumably because the
        # package is only installed through the task ``environment`` -- confirm.
        from yt.wrapper import YtClient, TablePath, DsvFormat

        table_info = get_yt_table_info(self.ctx, GeoSuggestTrainingPoolTableParameter)
        logger.info("read_clicks: table_info:\n %s", "{}".format(table_info))
        table_path = table_info["table_path"]  # Example: table_path=//home/qreg/amulenkov/myak_training_pool_2018_08_16[:#2000]
        columns = table_info["columns"]  # Example: columns=timestamp;ll;spn;ull;platform;prefix;stype;text;what_id;what_name;where_id;where_name;lang
        self.ctx["training_table_path"] = table_path
        self.ctx["training_table_columns"] = columns

        yt_proxy = "hahn"
        yt_token = self.get_yt_token()
        client = YtClient(proxy=yt_proxy, token=yt_token)

        columns_list = columns.split(";")
        rows = client.read_table(TablePath(table_path, columns=columns_list), format=DsvFormat())
        with open(output_filepath, "w") as output_file:
            for row in rows:
                # Whitespace runs (including tabs and line breaks) inside a
                # value are collapsed so they cannot break the TSV layout.
                line = "\t".join([cleanup_spaces(row[column]) for column in columns_list])
                line = line.replace(r"\\/", "/").replace(r"\/", "/")  # Temporary workaround for handling "\\/" and "\/" presence in training table
                output_file.write(line + "\n")

    def collect_pool(self, clicks_filepath, output_directory, factor_names_filepath):
        """Collect raw pool data using a locally started GeoSuggest daemon.

        While the daemon is running, its factor names list is saved to
        *factor_names_filepath* and the ``prepare_ranking_pool`` binary is run
        with the clicks file as input and *output_directory* as output.  The
        selected pool type contributes extra command-line arguments.

        :param clicks_filepath: TSV file passed to the binary via ``-i``.
        :param output_directory: directory passed to the binary via ``-o``.
        :param factor_names_filepath: file that receives the daemon's
            response to ``/get-factors-list?format=Index_Name``.
        """
        def start_daemon():
            # Sync the daemon binary and its data, then wrap them; the wrapper
            # is used as a context manager by the caller.
            daemon_path = self.sync_resource(get_or_default_with_ctx_update(self.ctx, GeoSuggestDaemonParameter))
            data_dir = self.sync_resource(get_or_default_with_ctx_update(self.ctx, GeoSuggestDataParameter))
            logs_dir = os.path.join(self.log_path(), "geosuggestd")

            return GeoSuggestDaemonWrapper(
                daemon_path=sdk2.path.Path(daemon_path),
                data_dir=sdk2.path.Path(data_dir),
                logs_dir=sdk2.path.Path(logs_dir)
            )

        def get_factors_list(geosuggestd_url):
            response = urllib2.urlopen(geosuggestd_url + "/get-factors-list?format=Index_Name")
            # The urllib2 response is not a context manager on Python 2, so it
            # is closed explicitly to avoid leaking the connection.
            try:
                return response.read()
            finally:
                response.close()

        geo_suggest_sandbox_bin_path = self.sync_resource(get_or_default_with_ctx_update(self.ctx, GeoSuggestSandboxBinParameter))
        pool_type = get_or_default_with_ctx_update(self.ctx, GeoSuggestRankingPoolType)

        with start_daemon() as geosuggestd:
            logger.info("collect_pool: daemon launched at port %s", geosuggestd.port)
            geosuggestd_url = "http://localhost:{0}".format(geosuggestd.port)

            logger.info("collect_pool: acquiring factor names list into %s...", factor_names_filepath)
            factors_list = get_factors_list(geosuggestd_url)
            with open(factor_names_filepath, "w") as factor_names_file:
                factor_names_file.write(factors_list)

            logger.info("collect_pool: acquiring data for pool with type '%s' into %s...", pool_type, output_directory)
            args = [
                os.path.join(geo_suggest_sandbox_bin_path, "prepare_ranking_pool"),
                "-i", clicks_filepath,
                "-u", geosuggestd_url,
                "-p", "timeout=1000",
                "-o", output_directory,
            ]
            args.extend(PoolTypes.get_additional_arguments(pool_type))
            run_process(args, log_prefix="prepare_ranking_pool")

    def concatenate_files(self, input_directory, output_filepath):
        """Concatenate all regular files of *input_directory* into one file.

        Files are processed in sorted name order so the result is
        reproducible (``os.listdir`` returns names in arbitrary order).
        Sub-directories are skipped.  If a file does not end with a newline,
        one is added so that its last row is not glued to the first row of
        the next file.

        :param input_directory: directory whose files are concatenated.
        :param output_filepath: path of the combined file to create.
        """
        with open(output_filepath, "w") as output_file:
            for name in sorted(os.listdir(input_directory)):
                input_filepath = os.path.join(input_directory, name)
                if not os.path.isfile(input_filepath):
                    continue
                logger.info("concatenate_files: %s -> %s", input_filepath, output_filepath)
                last_line = None
                with open(input_filepath, "r") as input_file:
                    for line in input_file:
                        output_file.write(line)
                        last_line = line
                if last_line is not None and not last_line.endswith("\n"):
                    output_file.write("\n")

    def convert_to_fml_pool(self, raw_features_filepath, fml_pool_filepath):
        """Rearrange the columns of the raw features file into the FML pool layout.

        Each output line consists of input columns 2, 9, 8 and 11 (1-based)
        followed by all columns starting from the 16th.  Only the line
        terminator is stripped before splitting, so empty leading or trailing
        columns are preserved and cannot shift the column positions.

        :param raw_features_filepath: tab-separated input file.
        :param fml_pool_filepath: path of the output file to create.
        :raises ValueError: if a line has a different number of columns than
            the first line, or fewer than 11 columns.
        """
        first_line_columns_count = None
        with open(fml_pool_filepath, "w") as output_file:
            with open(raw_features_filepath, "r") as input_file:
                for line_number, line in enumerate(input_file, 1):
                    # Do not use strip(): it would also eat leading/trailing
                    # tabs, i.e. empty first/last columns.
                    fields = line.rstrip("\r\n").split("\t")
                    if first_line_columns_count is None:
                        first_line_columns_count = len(fields)
                    if len(fields) != first_line_columns_count:
                        raise ValueError("Columns count in line {} differs from columns count in first line: {} vs {}\nLine:\n{}".format(
                            line_number,
                            len(fields),
                            first_line_columns_count,
                            line))
                    if len(fields) < 11:
                        raise ValueError("Columns count in line {} is less than required minimum".format(line_number))
                    # The same as: awk -F "\t" '{print $2"\t"$9"\t"$8"\t"$11"\t"$0}' | cut -f 1-4,20-
                    # (awk prepends 4 columns, so cut's 20th column is the
                    # 16th original column, i.e. index 15).
                    rearranged_fields = [
                        fields[1],
                        fields[8],
                        fields[7],
                        fields[10],
                    ] + fields[15:]
                    output_file.write("\t".join(rearranged_fields) + "\n")

    def on_execute(self):
        """Run the whole pipeline and publish the resulting pool.

        All intermediate files are placed in numbered sub-directories of
        ``<task dir>/artefacts``.  The published resource is the
        ``5_final_result/pool_fml`` directory, which contains
        ``factor_names.txt`` and the gzipped pool ``features.tsv.gz``.
        """
        work_dir = os.path.join(self.abs_path(), "artefacts")
        make_folder(work_dir)

        logger.info("Reading clicks...")
        clicks_dir = os.path.join(work_dir, "1_training_data")
        make_folder(clicks_dir)
        clicks_filepath = os.path.join(clicks_dir, "clicks.tsv")
        self.read_clicks(clicks_filepath)

        logger.info("Collecting pool...")
        split_features_dir = os.path.join(work_dir, "2_pool_split")
        final_result_dir = os.path.join(work_dir, "5_final_result", "pool_fml")
        # The factor names go straight into the final result directory, so it
        # has to exist before the pool is collected.
        factor_names_filepath = os.path.join(final_result_dir, "factor_names.txt")
        make_folder(split_features_dir)
        make_folder(final_result_dir)
        self.collect_pool(clicks_filepath, split_features_dir, factor_names_filepath)

        logger.info("Joining features files...")
        joined_features_dir = os.path.join(work_dir, "3_pool_combined")
        make_folder(joined_features_dir)
        joined_features_filepath = os.path.join(joined_features_dir, "features.tsv")
        self.concatenate_files(split_features_dir, joined_features_filepath)

        logger.info("Converting to FML pool format...")
        fml_dir = os.path.join(work_dir, "4_pool_for_fml_uncompressed")
        make_folder(fml_dir)
        fml_pool_filepath = os.path.join(fml_dir, "pool.tsv")
        self.convert_to_fml_pool(joined_features_filepath, fml_pool_filepath)

        logger.info("Compressing FML pool...")
        fml_pool_compressed_filepath = os.path.join(final_result_dir, "features.tsv.gz")
        gzip_file(fml_pool_filepath, fml_pool_compressed_filepath)

        logger.info("Publishing results...")
        resource_attributes = {}
        resource = self.create_resource(
            "Evaluation pool for FML. GeoSuggest daemon: #{}. GeoSuggest data: #{}. Training table: {}.".format(
                self.ctx[GeoSuggestDaemonParameter.name],
                self.ctx[GeoSuggestDataParameter.name],
                self.ctx["training_table_path"]),
            final_result_dir,
            MAPS_GEO_SUGGEST_RANKING_POOL,
            attributes=resource_attributes)
        self.mark_resource_ready(resource.id)

        logger.info("Finished")


# Module-level alias for the task class defined above.
# NOTE(review): presumably the hook through which Sandbox discovers the task
# in this module -- confirm against the Sandbox SDK.
__Task__ = GeoSuggestPrepareRankingPool
