# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import logging
import time
import re

try:
    from yt_worker import YtWorker, yt
except ImportError:
    from .yt_worker import YtWorker, yt

import yt.yson as yson


# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class YtCopyCommandAnalysis(YtWorker):
    """Find ``copy`` commands whose source is in the ACL dump but whose destination is not.

    For every analysed cluster the enrich table is filtered down to successful
    ``copy`` commands, joined twice (sort + reduce) against that cluster's ACL
    dump table for the current date, and the surviving rows are re-checked
    against the live cluster so that only destinations which currently grant
    some permission to the ``yandex`` subject remain.

    NOTE(review): ``self.yt``, ``self.yt_token``, ``self.source_path``,
    ``self.analysis_folder`` and ``self.enrich_table`` are not defined in this
    class; presumably ``YtWorker`` provides them -- confirm against the base.
    """

    def __init__(self, token, clusters_for_analysis=None):
        """Validate the cluster list and make sure the result folder exists.

        Args:
            token: Passed through to ``YtWorker.__init__``.
            clusters_for_analysis: ``None`` (defaults to ``["hahn"]``) or a
                list of text strings with cluster names.

        Raises:
            ValueError: The list contains a non-text element.
            TypeError: ``clusters_for_analysis`` is neither ``None`` nor a list.
        """
        super(YtCopyCommandAnalysis, self).__init__(
            token=token, source_path="//home/acl_dump")

        if clusters_for_analysis is None:
            self.clusters_for_analysis = ["hahn"]
        elif isinstance(clusters_for_analysis, list):
            # ``unicode`` exists only on Python 2; referencing it directly
            # raises NameError on Python 3, where ``str`` is the text type.
            try:
                text_type = unicode  # noqa: F821
            except NameError:
                text_type = str

            if not all(isinstance(value, text_type) for value in clusters_for_analysis):
                raise ValueError("Invalid cluster value type")
            self.clusters_for_analysis = clusters_for_analysis
        else:
            raise TypeError("clusters_for_analysis must be list!")

        self.result_path = self.analysis_folder + "/user-copy-acl-abuse"
        # Path of the table currently being built; set per cluster in run_analysis().
        self.result_table = None
        # Paths of all result tables produced by run_analysis().
        self.result_tables_paths = list()
        self.yt.create("map_node", self.result_path,
                       recursive=True, ignore_existing=True)
        # Cluster whose rows the mapper keeps; set per cluster in run_analysis().
        self.current_cluster = None

    @staticmethod
    def _get_current_date(delim=False):
        """Return today's local date as ``YYYYMMDD``, or ``YYYY-MM-DD`` if ``delim`` is true."""
        from datetime import date
        date_format = "%Y-%m-%d" if delim else "%Y%m%d"
        return date.today().strftime(date_format)

    def _dump_table_path(self, cluster):
        """Return ``<source_path>/<cluster>/<cluster>.<YYYYMMDD>`` for today's date."""
        return "".join([self.source_path, "/", cluster,
                        "/", cluster, ".", self._get_current_date()])

    def _prepare_private_tables_paths(self):
        """Load the ``path`` column of today's ACL dump for every analysed cluster.

        The rows are stored per cluster in the (name-mangled) private attribute
        ``__private_tables_by_cluster``.

        Raises:
            RuntimeError: Reading any dump table failed; the original
                exception is passed as the error argument.
        """
        self.__private_tables_by_cluster = dict()
        for cluster in self.clusters_for_analysis:
            source_table = self._dump_table_path(cluster)
            try:
                self.__private_tables_by_cluster[cluster] = list(self.yt.read_table(
                    self.yt.TablePath(source_table, columns=["path"])))
            except Exception as exc:
                # logger.exception keeps the traceback that the re-raise below loses.
                logger.exception(
                    "Error while trying to read ACL dumps for cluster %s", cluster)
                raise RuntimeError(exc)

    @yt.aggregator
    def _map_users_run_copy(self, records):
        """Mapper: keep successful ``copy`` commands issued on the current cluster.

        A row is kept when its ``cluster`` equals ``self.current_cluster``, its
        ``command`` is ``"copy"``, its ``done_status`` is the string ``"True"``
        and it has ``parameters``. Source and destination paths are extracted
        from the stringified parameters with a regular expression; a path that
        cannot be found is emitted as ``None``.

        Yields:
            dict with ``user``, ``cluster``, ``source_path``,
            ``destination_path`` and ``request_id``.
        """
        for record in records:
            cluster = record.get("cluster")
            parameters = record.get("parameters")

            # Exact comparison on purpose: a substring test (``in``) raises
            # TypeError for rows without a cluster and accepts "" or partial names.
            if cluster is None or cluster != self.current_cluster:
                continue
            if record.get("command") != "copy" or record.get("done_status") != "True":
                continue
            if parameters is None:
                continue

            parameters_text = str(parameters)
            destination_match = re.search(
                "\"destination_path\"=(.+?)(//[^\"]+)", parameters_text)
            source_match = re.search(
                "\"source_path\"=(.+?)(//[^\"]+)", parameters_text)

            source_path = None
            destination_path = None
            if destination_match is not None:
                destination_path = destination_match.group(2)
            if source_match is not None:
                source_path = source_match.group(2)

            # A fresh dict per row: re-yielding one mutated dict would make
            # every row still held by the consumer alias the same object.
            yield {"user": record.get("user"),
                   "cluster": cluster,
                   "source_path": source_path,
                   "destination_path": destination_path,
                   "request_id": record.get("request_id")}

    def _reduce_users_run_copy(self, key, records):
        """Reducer: collapse rows sharing user, source path and destination path.

        The source path is emitted under the column name ``path`` so the result
        can be joined with the ACL dump; all non-null request ids of the group
        are emitted as the string form of a list.
        """
        request_ids = [record.get("request_id") for record in records]
        request_ids = [request_id for request_id in request_ids
                       if request_id is not None]

        yield {"user": key["user"],
               "path": key["source_path"],
               "destination_path": key["destination_path"],
               "request_id": str(request_ids)}

    def _reduce_with_dump_src(self, key, records):
        """Join reducer on the source path: keep copies whose source is in the dump.

        Rows carrying ``effective_acl`` are treated as dump rows and mark the
        source as private. The destination path is re-emitted as ``path`` so
        the next join can key on it.

        NOTE(review): one row is emitted per key, so when several copy rows
        share a source path only the last seen user / destination / request_id
        survive -- confirm this is intended.
        """
        reduced_record = dict()
        reduced_record["source_path"] = key["path"]
        reduced_record["src_is_private"] = False
        for record in records:
            effective_acl = record.get("effective_acl")
            user = record.get("user")
            destination_path = record.get("destination_path")
            request_id = record.get("request_id")

            if effective_acl is not None:
                reduced_record["src_is_private"] = True

            if user is not None:
                reduced_record["user"] = user

            if destination_path is not None:
                reduced_record["path"] = destination_path

            if request_id is not None:
                reduced_record["request_id"] = request_id

        if reduced_record["src_is_private"]:
            yield reduced_record

    def _reduce_with_dump_dest(self, key, records):
        """Join reducer on the destination path: keep private -> non-private copies.

        Rows carrying ``effective_acl`` are treated as dump rows and mark the
        destination as private. A row is emitted only when the destination is
        not private while the source was flagged private by the previous join.

        NOTE(review): as in the source join, only the last seen values per key
        survive when several copy rows share a destination path.
        """
        reduced_record = dict()
        reduced_record["destination_path"] = key["path"]
        reduced_record["dest_is_private"] = False
        for record in records:
            effective_acl = record.get("effective_acl")
            user = record.get("user")
            source_path = record.get("source_path")
            request_id = record.get("request_id")
            src_is_private = record.get("src_is_private")

            if effective_acl is not None:
                reduced_record["dest_is_private"] = True

            if user is not None:
                reduced_record["user"] = user

            if source_path is not None:
                reduced_record["source_path"] = source_path

            if request_id is not None:
                reduced_record["request_id"] = request_id

            if src_is_private is not None:
                reduced_record["src_is_private"] = src_is_private

        # .get(): a group in which no row carried ``src_is_private`` must be
        # skipped, not fail the job with a KeyError.
        if (not reduced_record["dest_is_private"] and
                reduced_record.get("src_is_private") and
                reduced_record["destination_path"] is not None):
            yield reduced_record

    def _check_destination_table_attributes(self, table_format=None):
        """Keep only result rows whose destination is currently open to ``yandex``.

        Reads the whole result table, requests ``effective_acl`` of every
        destination path through one batch request against the current
        cluster, and rewrites the result table with the rows for which some
        ACL entry lists ``yandex`` among its subjects with at least one
        permission. Rows whose lookup produced no result are dropped.

        Args:
            table_format: Format used for reading and writing the result
                table; a new ``yt.YsonFormat()`` when ``None``.

        NOTE(review): the ACL entry's ``action`` is not inspected, so a
        ``deny`` entry for ``yandex`` would also count as open -- confirm.
        """
        if table_format is None:
            # Created per call instead of once at class-definition time.
            table_format = yt.YsonFormat()

        rows_list = list(self.yt.read_table(self.result_table, format=table_format))

        proxy = "".join([self.current_cluster, ".yt.yandex.net"])
        yt_client = self.yt.YtClient(proxy=proxy, token=self.yt_token)
        batch_client = yt_client.create_batch_client()

        # Requests are only queued here; results become available after commit_batch().
        batch_results = [(row, batch_client.get(path=row["destination_path"] + "/@",
                                                attributes=["effective_acl"]))
                         for row in rows_list]

        logger.info("Committing the batch ...")
        time_start = time.time()
        batch_client.commit_batch()
        time_delta = "{:.2f}s".format(time.time() - time_start)
        logger.info("Done in %s", time_delta)

        result_rows = list()
        for row, response in batch_results:
            attributes = response.get_result()
            if attributes is None:
                continue

            table_is_open = any(
                "yandex" in ace["subjects"] and len(ace["permissions"]) > 0
                for ace in attributes["effective_acl"])
            if table_is_open:
                result_rows.append(row)

        self.yt.write_table(self.result_table, result_rows,
                            format=table_format)

    @staticmethod
    def schema_mapper(records):
        """Mapper: stringify the columns that the string-typed schema cannot hold as-is.

        Despite the parameter name, ``records`` is a single row (a mapping); it
        is modified in place and yielded.
        """
        for column in ("request_id", "dest_is_private", "src_is_private"):
            records[column] = str(records[column])

        yield records

    @staticmethod
    def prepare_schema(columns=None, type="string"):
        """Build a YSON schema list in which every column has the same type.

        Args:
            columns: Iterable of column names; ``None`` yields an empty schema.
            type: Type assigned to every column (the name shadows the builtin
                but is part of the public signature).

        Returns:
            ``yson.YsonList`` of ``{"name": ..., "type": ...}`` entries.
        """
        # ``columns or ()`` makes the declared default of ``None`` usable.
        schema = yson.YsonList(
            [{"name": column, "type": type} for column in (columns or ())])
        # NOTE(review): kept from the original; a ``schema`` attribute on the
        # schema itself looks unusual -- possibly ``strict`` was meant. Confirm.
        schema.attributes["schema"] = False
        return schema

    def _analyze_cluster(self, cluster):
        """Run the map / sort / reduce pipeline for one cluster and record its result table."""
        self.result_table = "".join(
            [self.result_path, "/", cluster, "_copy_acl_abuse"])
        self.current_cluster = cluster

        if self.yt.exists(self.result_table):
            logger.warning(
                "Result table %s already exists, removing ...", self.result_table)
            self.yt.remove(self.result_table)

        dump_table = self._dump_table_path(cluster)

        logger.info(
            "Running map for cluster %s, result table: %s ...", cluster, self.result_table)
        self.yt.run_map(self._map_users_run_copy,
                        self.enrich_table, self.result_table)

        logger.info(
            "Running sort by user, source path and dest path...")
        self.yt.run_sort(self.result_table, sort_by=[
                         "user", "source_path", "destination_path"])

        logger.info("Running reduce by user and source path ...")
        self.yt.run_reduce(self._reduce_users_run_copy, self.result_table, self.result_table,
                           reduce_by=["user", "source_path", "destination_path"])

        logger.info(
            "Sorting reduce result by renamed column (source_path -> path) ...")
        self.yt.run_sort(self.result_table, sort_by="path")

        logger.info(
            "Running first join via reduce: py source_path as path ...")
        self.yt.run_reduce(self._reduce_with_dump_src, [self.result_table, dump_table],
                           self.result_table, reduce_by="path")

        logger.info(
            "Sorting again for the second join reduce by path ...")
        self.yt.run_sort(self.result_table, sort_by="path")

        logger.info(
            "Running second join via reduce py destination_path as path")
        self.yt.run_reduce(self._reduce_with_dump_dest, [self.result_table, dump_table],
                           self.result_table, reduce_by="path")

        # Set schema for result table
        schema = self.prepare_schema(columns=["request_id", "user", "source_path", "destination_path",
                                              "src_is_private", "dest_is_private"])
        # The path is passed as a format argument so braces in it cannot break format().
        dest_table = "<schema={0}>{1}".format(
            self.yt.yson.dumps(schema), self.result_table)

        logger.info(
            "Running map for schematizing result enrich table ...")
        self.yt.run_map(self.schema_mapper, self.result_table, dest_table)

        # Additional check -- check if dest table exists now and is open for subject yandex
        self._check_destination_table_attributes()

        self.result_tables_paths.append(self.result_table)

    def run_analysis(self):
        """Run the analysis for every configured cluster.

        Result table paths are accumulated in ``self.result_tables_paths``.

        Returns:
            ``True`` once all clusters have been processed.

        Raises:
            RuntimeError: The enrich table does not exist.
        """
        if not self.yt.exists(self.enrich_table):
            raise RuntimeError("Enrich table not exist!")

        for cluster in self.clusters_for_analysis:
            self._analyze_cluster(cluster)

        return True


def main():
    """Build the analysis with its default cluster list and run it.

    NOTE(review): ``token=None`` is forwarded unchanged; presumably the base
    worker resolves credentials some other way -- confirm.
    """
    YtCopyCommandAnalysis(token=None).run_analysis()


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
