from __future__ import print_function
import itertools
import logging
import re
import sys
import types
from collections import defaultdict

import yt.wrapper as yt

from crypta.graph.v1.python.infra import merge_graph_tables
from crypta.graph.v1.python.rtcconf import config
from crypta.graph.v1.python.utils import mr_utils as mr
from crypta.graph.v1.python.v2.soup import soup_dirs
from crypta.graph.v1.python.utils import yt_clients

# Confirmation phrase: in safe mode remove_stuff() deletes nothing unless the
# operator types exactly this line on stdin.
POWER_LINE = "I really want it!"
# Module-wide logger used by the removal helpers.
logger = logging.getLogger("graph_postproc")

# Matches a whole name that contains a YYYY-MM-DD date; because the leading
# ".*" is greedy, the *last* such date in the name is the one captured as "dt".
date_pattern = re.compile(r".*(?P<dt>\d{4}-\d{2}-\d{2}).*")


def date_from_table(table):
    """
    Extract the YYYY-MM-DD date embedded in a table/node name.

    :param table: node name (or path) to inspect
    :return: the captured date string, or "" when the name carries no date
    """
    found = date_pattern.match(table)
    # An empty string marks "not a dated table" for callers.
    return found.group("dt") if found is not None else ""


def clean_whole_dates_in_dir(folder, ndays, run_date):
    """
    Lazily list the dated entries of ``folder`` that fall outside the retention window.

    Entries are grouped by the date found in their name (see date_from_table).
    Entries without a date, or dated after ``run_date`` when it is given, are
    ignored entirely. Of the remaining dates the ``ndays`` newest are kept and
    every entry of an older date is yielded.

    :param folder: directory to scan; a trailing "/" is ignored
    :param ndays: number of newest distinct dates to keep
    :param run_date: upper date bound as a YYYY-MM-DD string, or None for no bound
    :return: generator of "<folder>/<entry>" paths; empty if the folder does not exist
    """
    folder = folder.rstrip("/")
    yt_client = yt_clients.get_yt_client()
    if not yt_client.exists(folder):
        return

    def within_run_date(name):
        date = date_from_table(name)
        if not date:
            return False
        # ISO dates compare correctly as plain strings.
        return run_date is None or date <= run_date

    dated = [name for name in yt_client.list(folder) if within_run_date(name)]
    # Newest first; groupby below relies on equal dates being adjacent.
    dated.sort(key=date_from_table, reverse=True)
    per_date = [list(names) for _, names in itertools.groupby(dated, key=date_from_table)]

    for names in per_date[ndays:]:
        for name in names:
            yield folder + "/" + name


def clean_date_file_or_dir(folder, table_or_dir, ndays, run_date):
    """
    Lazily list the per-date copies of ``table_or_dir`` that are outside the retention window.

    The candidates are the paths mr.get_date_tables returns for a 1000-day
    lookback minus the paths it returns for an ``ndays`` lookback (presumably
    1000 is just "large enough to cover everything stored" - confirm against
    mr.get_date_tables).

    :param folder: root folder holding the date sub-folders
    :param table_or_dir: relative table or directory name inside each date folder
    :param ndays: lookback passed to mr.get_date_tables for the paths to keep
    :param run_date: passed to mr.get_date_tables as ``before_date``
    :return: generator of paths in sorted order; empty if the folder does not exist
    """
    yt_client = yt_clients.get_yt_client()
    if yt_client.exists(folder.rstrip("/")):
        kept = set(mr.get_date_tables(folder, table_or_dir, ndays, before_date=run_date))
        known = set(mr.get_date_tables(folder, table_or_dir, 1000, before_date=run_date))

        for path in sorted(known - kept):
            # Paths ending with "/" are yielded without an existence check.
            if path.endswith("/") or yt_client.exists(path):
                yield path


def drop_date_folders(folder, ndays):
    """
    Lazily list the dated nodes of ``folder`` that are older than the ``ndays`` newest dates.

    A node is considered only if its name contains "<year>-" for some year in
    2015..2099 and a YYYY-MM-DD date (the first such date in the name is used
    as its key). Nodes are grouped by that date, the ``ndays`` newest dates
    are kept, and the nodes of every older date are yielded, newest date first.

    :param folder: directory whose children are inspected
    :param ndays: number of newest distinct dates to keep
    :return: generator of "<folder>/<node>" paths that exist at the time they are yielded
    """
    year_markers = tuple("%s-" % year for year in range(2015, 2100))

    yt_client = yt_clients.get_yt_client()
    date_dict = defaultdict(list)
    for node in yt_client.list(folder):
        if not any(marker in node for marker in year_markers):
            continue
        found = re.search(r"\d{4}-\d{2}-\d{2}", node)
        if found is None:
            # e.g. "2019-stats": has a year marker but no full date, so it is
            # not a dated node and must be left alone instead of crashing.
            continue
        date_dict[found.group(0)].append(folder + "/" + node)

    # ISO dates sort chronologically as strings; skip the ndays newest ones.
    for date in sorted(date_dict, reverse=True)[ndays:]:
        for table_or_dir in date_dict[date]:
            if yt_client.exists(table_or_dir):
                yield table_or_dir

def remove_all(tbls):
    """
    Drop every path in ``tbls`` via mr.drop, logging the progress of each one.

    Removal is best-effort: a failure on one path is logged with its traceback
    and the loop moves on to the next path. KeyboardInterrupt and SystemExit
    are not caught, so the operator can still abort a running cleanup.

    :param tbls: iterable of table/directory paths
    """
    for t in tbls:
        logger.info("Removing %s: started", t)
        try:
            mr.drop(t)
        except Exception:
            # logger.exception records the reason, which a plain error() hid.
            logger.exception("Cannot remove %s", t)
        else:
            logger.info("Removing %s: removing done.", t)


def prepare_v2_tables(run_date, days, debug_days, tmp_days, history_days):
    """
    Build the list of lazy cleanup generators for the v2 / radius / shared-ids data.

    Nothing is listed or removed here: each element is an un-started generator
    that yields the paths to delete once it is iterated.

    :param run_date: upper date bound forwarded to every cleaner
    :param days: number of dates to keep for the main folders
    :param debug_days: number of dates to keep for debug data
    :param tmp_days: unused here; accepted to mirror prepare_v1_tables
    :param history_days: unused here; accepted to mirror prepare_v1_tables
    :return: list of generators
    """
    home = config.CRYPTA_GRAPH_CRYPTA_HOME
    shared_ids = config.CRYPTA_SHARED_IDS_FOLDER

    # (folder, number of newest dates to keep) for whole-date cleanup
    whole_dir_rules = (
        # radius
        (config.RADIUS_LOG_YT_FOLDER, days),
        (config.RADIUS_METRICS_YT_FOLDER, days),
        # v2 soup
        (soup_dirs.SOUP_DAY_DIR, debug_days),
        (soup_dirs.SOUP_DAY_LOGS_DIR, debug_days),
        # shared ids
        (shared_ids + "heuristic_desktop_yuids", days),
        (shared_ids + "merged", days),
        (shared_ids + "yandex_drive", days),
        # webvisor
        (home + "/state", days),
        (home + "/state/webvisor_processed", days),
        (home + "/state/webvisor_processed/stats", days),
    )

    stuff = [
        clean_date_file_or_dir(config.RADIUS_METRICS_YT_FOLDER, "yuid_rlogin/yuid_ua_tmp", debug_days, run_date),
    ]
    for folder, keep in whole_dir_rules:
        stuff.append(clean_whole_dates_in_dir(folder, keep, run_date))

    return stuff


def prepare_v1_tables(run_date, days, debug_days, tmp_days, history_days):
    """
    Build the list of lazy cleanup generators for the v1 graph data.

    Nothing is listed or removed here: each element is an un-started generator
    that yields the paths to delete once it is iterated. Per-table cleaners
    come first, then the whole-date cleaners of the main folders, then the
    history cleaner.

    :param run_date: upper date bound forwarded to every cleaner
    :param days: number of dates to keep for the main folders
    :param debug_days: number of dates to keep for data that is useful for debugging
    :param tmp_days: number of dates to keep for purely temporary data
    :param history_days: number of dates to keep under "history/"
    :return: list of generators
    """
    v1_graph_dir = config.YT_OUTPUT_FOLDER
    v1_indevice_dir = config.INDEVICE_YT_FOLDER
    v1_idserv_dir = config.IS_OUTPUT_FOLDER
    v1_vertices_radius_metrics_dir = config.RADIUS_METRICS_YT_FOLDER

    # TODO: vertices configs must be here!
    radius_vertices_types = {
        "exact": ["", "cluster", "cluster_experiment"],
        "fuzzy": ["", "cluster"],
        "no_login": ["cluster"],
        "v2": [""],
    }
    # separate variable just because of stupid tmp name
    vertices_folders = ["exact", "tmp", "no_login", "v2"]

    # (folder, table_or_dir, number of dates to keep)
    per_table_rules = []

    # .items() instead of .iteritems() so this also runs on Python 3
    for vertices_type, clustering_types in radius_vertices_types.items():
        for clustering_type in clustering_types:
            postfix = "_" + clustering_type if clustering_type else ""
            per_table_rules.append((v1_vertices_radius_metrics_dir, vertices_type + postfix, debug_days))

    # vertices: all useful tables are copied in GraphHistorySnapshot so we can cleanup here
    for vertices_folder in vertices_folders:
        per_table_rules.append((v1_graph_dir, vertices_folder, debug_days))

    # debug cleanup: keeps only last n days of specified folders to allow debug
    # ("raw_links/yuid_ip_ts" used to be listed twice, which queued the same
    # paths for removal twice; it is listed once now)
    per_table_rules.extend(
        [
            (v1_graph_dir, "raw_links/yuid_ip_ts", tmp_days),
            (v1_graph_dir, "raw_links/bad_ua", tmp_days),
            (v1_graph_dir, "raw_links/bad_ua_access", tmp_days),
            (v1_graph_dir, "yuid_raw/ui_yuid_all", tmp_days),
            (v1_graph_dir, "yuid_apps", tmp_days),
            (v1_graph_dir, "mobile/dev_info", debug_days),
            (v1_graph_dir, "mobile/uuid_info", debug_days),
            (v1_graph_dir, "mobile/dev_uuid_indevice_perfect_no_limit", tmp_days),
            (v1_graph_dir, "mobile/uuid_dev_no_limit_extended", tmp_days),
            (v1_graph_dir, "mobile/dev_uuid_indevice_perfect_no_limit_tmp", tmp_days),
            (v1_graph_dir, "mobile/account_manager/account_manager_dict", debug_days),
            (v1_graph_dir, "mobile/app_stats/apps_day", debug_days),
            (v1_indevice_dir, "perfect/dev_yuid_indevice_perfect_no_limit", debug_days),
            (v1_indevice_dir, "perfect/dev_yuid_indevice_unperfect_no_limit", debug_days),
            (v1_indevice_dir, "perfect/devid_raw_month", debug_days),
            (v1_indevice_dir, "perfect/devid_yuid_all", debug_days),
            (v1_indevice_dir, "perfect/app_stats/month/apps", debug_days),
            (v1_indevice_dir, "perfect/app_stats/month/uuid_tmp", debug_days),
            (v1_graph_dir, "stat_new/", debug_days),
            (v1_graph_dir, "upload_bb/", debug_days),
            (v1_graph_dir, "pairs/", debug_days),
            (v1_idserv_dir, "mapping_cryptaid_tmp", debug_days),
        ]
    )

    stuff = [
        clean_date_file_or_dir(folder, table_or_dir, keep, run_date)
        for folder, table_or_dir, keep in per_table_rules
    ]

    # final month cleanup: keeps only last month of main dirs
    for main_dir in (v1_graph_dir, v1_idserv_dir, v1_indevice_dir):
        stuff.append(clean_whole_dates_in_dir(main_dir, days, run_date))

    stuff.append(clean_date_file_or_dir(v1_graph_dir + "history/", "fuzzy_vertices", history_days, run_date))
    return stuff


def _flatten_paths(items):
    """Yield the leaves of an arbitrarily nested mix of lists/tuples/sets/generators, depth-first."""
    for item in items:
        if isinstance(item, (list, tuple, set, types.GeneratorType)):
            for leaf in _flatten_paths(item):
                yield leaf
        else:
            yield item


def remove_stuff(to_remove, safe_mode):
    """
    Remove every path contained in ``to_remove``.

    ``to_remove`` may nest lists, tuples, sets and generators to any depth; it
    is flattened first (which also runs the lazy cleanup generators).

    In safe mode the paths that currently exist are printed and removed only
    if the operator types POWER_LINE on stdin; otherwise nothing is removed.
    Without safe mode all flattened paths are passed straight to remove_all.

    :param to_remove: (nested) iterable of table/directory paths
    :param safe_mode: ask for interactive confirmation before removing
    """
    to_remove = list(_flatten_paths(to_remove))

    if not safe_mode:
        remove_all(to_remove)
        return

    print("Going to remove all these tables/directories:")
    yt_client = yt_clients.get_yt_client()
    # Existence is checked once per path; a single trailing "/" is dropped for
    # the check only, the path itself is kept as given.
    to_remove = [t for t in to_remove if yt_client.exists(t[:-1] if t.endswith("/") else t)]
    for t in to_remove:
        print("\t%s" % t)

    print('If you really want it, type "%s"' % POWER_LINE)
    sys.stdout.flush()
    line = sys.stdin.readline().strip()
    if line == POWER_LINE:
        print("You asked for it!")
        remove_all(to_remove)
    else:
        print("LOL, no")


def run_postproc(safe_mode=True, run_date=None, clean_v1=True, clean_v2=True):
    """
    Run the cleanup of old graph tables.

    Retention is derived from config.STORE_DAYS (history is kept twice as
    long); the debug/tmp retention shrinks to one day when config.CRYPTA_ENV
    is "testing". The v1 cleanup runs before the v2 cleanup.

    :param safe_mode: forwarded to remove_stuff (interactive confirmation)
    :param run_date: upper date bound forwarded to the cleaners, None for no bound
    :param clean_v1: run the v1 cleanup
    :param clean_v2: run the v2 cleanup
    """
    keep_days = int(config.STORE_DAYS)

    if config.CRYPTA_ENV == "testing":
        debug_days = tmp_days = 1
    else:
        # debug: may contain some useful info for debug
        # tmp: requires only for single day of process, no useful debug info
        debug_days, tmp_days = 4, 2

    retention = (keep_days, debug_days, tmp_days, keep_days * 2)

    for enabled, prepare in ((clean_v1, prepare_v1_tables), (clean_v2, prepare_v2_tables)):
        if enabled:
            remove_stuff(prepare(run_date, *retention), safe_mode)


if __name__ == "__main__":
    # Script entry point: point the YT wrapper at the configured proxy and
    # enable INFO logging before doing anything else.
    yt.config.set_proxy(config.MR_SERVER)
    logging.basicConfig(level="INFO")

    # Interactive cleanup first (asks for a typed confirmation), then the
    # v1 dictionaries merge.
    run_postproc(safe_mode=True)
    merge_graph_tables.merge_v1_dicts()
