#!/usr/bin/python -tt
# coding=utf-8
import logging
import re
import sys
import types
from collections import defaultdict

import yt.wrapper as yt

from infra import merge_graph_tables
from rtcconf import config
from utils import mr_utils as mr
from v2.soup import soup_config

__author__ = 'rodion'

POWER_LINE = "I really want it!"
logger = logging.getLogger('graph_postproc')


def clean_whole_dates_in_dir(folder, ndays):
    """
    Yield the date-named children of ``folder`` that fall outside the newest ``ndays``.

    Children of ``folder`` whose names start with ``YYYY-MM-DD`` are sorted by
    name in descending order; the first ``ndays`` of them are kept and every
    remaining one is yielded as ``folder + '/' + name``.  Nothing is yielded
    when ``folder`` does not exist or holds at most ``ndays`` such children.

    :param folder: YT path of the directory to inspect; one trailing '/' is dropped
    :param ndays: how many of the newest date-named children to keep
    :return: generator over the full paths of the outdated children
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})')
    if folder.endswith('/'):
        folder = folder[:-1]
    if not yt.exists(folder):
        return

    # A list comprehension instead of filter(): len() and slicing below need a
    # real list, and filter() only returns one on Python 2.
    dated_names = [name for name in yt.list(folder) if date_pattern.match(name)]
    if len(dated_names) > ndays:
        # ISO dates sort chronologically as strings, so descending order puts
        # the newest names first and the slice drops exactly those.
        for name in sorted(dated_names, reverse=True)[ndays:]:
            yield folder + '/' + name


def clean_date_file_or_dir(folder, table_or_dir, ndays):
    """
    Yield the per-date ``table_or_dir`` paths that are not among the newest ``ndays``.

    ``mr.get_date_tables`` is queried twice: once with ``ndays`` (paths to keep)
    and once with 1000 (presumably "everything" -- confirm against mr_utils).
    The difference of the two sets, in sorted order, is the removal candidate
    list.

    :param folder: base folder handed to ``mr.get_date_tables``
    :param table_or_dir: table or directory name handed to ``mr.get_date_tables``
    :param ndays: window size of paths that must be kept
    :return: generator over candidate paths; a path ending with '/' is yielded
             unconditionally, any other path only if ``yt.exists`` confirms it
    """
    keep = set(mr.get_date_tables(folder, table_or_dir, ndays))
    everything = set(mr.get_date_tables(folder, table_or_dir, 1000))
    outdated = list(everything - keep)
    outdated.sort()

    for path in outdated:
        # Only paths without a trailing slash are checked for existence.
        if not path.endswith('/') and not yt.exists(path):
            continue
        yield path


def drop_date_folders(folder, ndays):
    """
    Yield the date-stamped children of ``folder`` older than the newest ``ndays`` dates.

    A child is considered only if its name contains a ``YYYY-`` marker for a
    year in 2015..2099 and also contains a full ``YYYY-MM-DD`` date.  Children
    are grouped by the first such date in their name, the groups are ordered
    from newest to oldest date, the first ``ndays`` groups are kept, and the
    members of every remaining group are yielded if they still exist.

    :param folder: YT path of the directory whose children are inspected
    :param ndays: number of distinct newest dates to keep
    :return: generator over ``folder + '/' + name`` for every outdated, existing child
    """
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
    # Built once instead of on every lambda call.
    year_markers = ['%s-' % year for year in range(2015, 2100)]

    nodes_by_date = defaultdict(list)
    for node in yt.list(folder):
        if not any(marker in node for marker in year_markers):
            continue
        found = date_pattern.search(node)
        if found is None:
            # The year marker alone does not guarantee a full date (e.g.
            # "2016-stats"); such names are skipped instead of crashing on None.
            continue
        nodes_by_date[found.group(0)].append(folder + '/' + node)

    # Sorting the keys descending puts the newest dates first; everything past
    # the first ``ndays`` dates is outdated.
    for date in sorted(nodes_by_date, reverse=True)[ndays:]:
        for table_or_dir in nodes_by_date[date]:
            if yt.exists(table_or_dir):
                yield table_or_dir


def remove_all(tbls):
    """
    Best-effort removal of every path in ``tbls`` via ``mr.drop``.

    A failure on one path is logged and does not stop the remaining removals.

    :param tbls: iterable of paths to remove
    :return: None
    """
    for t in tbls:
        logger.info('Removing %s: started', t)
        try:
            mr.drop(t)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit can still abort a deletion run; logger.exception
            # records the traceback so the failure reason is not lost.
            logger.exception('Cannot remove %s', t)
        else:
            logger.info('Removing %s: removing done.', t)


def _flatten_targets(items):
    """Depth-first flatten of nested lists/tuples/sets/generators into single paths."""
    for item in items:
        if isinstance(item, (list, tuple, set, types.GeneratorType)):
            for nested in _flatten_targets(item):
                yield nested
        else:
            yield item


def run_postproc(safe_mode=True):
    """
    Collect outdated tables/directories and remove them.

    Retention windows:
      * ``days`` (``config.STORE_DAYS``) for whole date folders of the main dirs;
      * ``debug_days`` / ``tmp_days`` for individual tables inside date folders:
        1 / 1 when ``config.CRYPTA_ENV == 'testing'``, otherwise 4 / 2.

    All candidate paths are fully collected before anything is removed.

    :param safe_mode: when True, print the existing candidates and remove them
                      only if the user types ``POWER_LINE`` on stdin; when
                      False, remove every candidate without asking
    :return: None
    """
    days = int(config.STORE_DAYS)

    if config.CRYPTA_ENV == 'testing':
        debug_days = 1
        tmp_days = 1
        # we copy it ourselves in CopyFromProdTask, need to cleanup
        fingerprints = list(clean_whole_dates_in_dir(config.FP_FOLDER, days))
    else:
        debug_days = 4  # may contain some useful info for debug
        tmp_days = 2  # requires only for single day of process, no useful debug info
        fingerprints = []

    # TODO: vertices configs must be here!
    radius_vertices_types = {'exact': ['', 'cluster', 'cluster_experiment'],
                             'fuzzy': ['', 'cluster'],
                             'no_login': ['cluster'],
                             'v2': ['']}
    # separate variable because the output folder for one of the types is named 'tmp'
    vertices_folders = ['exact', 'tmp', 'no_login', 'v2']

    radius = []
    # .items() behaves like .iteritems() here and also works on Python 3
    for vertices_type, clustering_types in radius_vertices_types.items():
        for clustering_type in clustering_types:
            postfix = '_' + clustering_type if clustering_type else ''
            radius.append(clean_date_file_or_dir(config.RADIUS_METRICS_YT_FOLDER,
                                                 vertices_type + postfix,
                                                 debug_days))
    # vertices: all useful tables are copied in GraphHistorySnapshot so we can cleanup here
    vertices = [clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, vertices_folder, debug_days)
                for vertices_folder in vertices_folders]

    stuff = fingerprints + radius + vertices + [
        # debug cleanup: keeps only last n days of specified folders to allow debug
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/household/logins/login', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/household/logins/mailru', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/household/logins/vk', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/webvisor_date_processed', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/household/ip_households', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/webvisor_date_processed/stats', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/webvisor_processed', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/toloka/cross_device', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/toloka/indevice_desktop', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/toloka/indevice_mobile', days),
        # drop_date_folders(config.CRYPTA_GRAPH_CRYPTA_HOME + '/state/webvisor_processed/stats', days),
        # clean_date_file_or_dir(config.YT_OUTPUT_FOLDER + 'v2/soup/supometriya', 'all.supometr_tmp', tmp_days),
        # clean_date_file_or_dir(config.YT_OUTPUT_FOLDER + 'v2/soup/supometriya', 'day.supometr_tmp', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'raw_links/yuid_ip_ts', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'raw_links/bad_ua', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'raw_links/bad_ua_access', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'raw_links/yuid_dit_cookie', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'ip_yuid_stream', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'ip_ts_yuid', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'yuid_raw/ui_yuid_all', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'yuid_apps', tmp_days),

        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/ip_dev_stream', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/dev_info', debug_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/uuid_info', debug_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/dev_uuid_indevice_perfect_no_limit', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/uuid_dev_no_limit_extended', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/dev_uuid_indevice_perfect_no_limit_tmp', tmp_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/account_manager/account_manager_dict', debug_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'mobile/app_stats/apps_day', debug_days),

        clean_date_file_or_dir(config.INDEVICE_YT_FOLDER, 'perfect/dev_yuid_indevice_perfect_no_limit', debug_days),
        clean_date_file_or_dir(config.INDEVICE_YT_FOLDER, 'perfect/dev_yuid_indevice_unperfect_no_limit', debug_days),
        clean_date_file_or_dir(config.INDEVICE_YT_FOLDER, 'perfect/devid_raw_month', debug_days),
        clean_date_file_or_dir(config.INDEVICE_YT_FOLDER, 'perfect/devid_yuid_all', debug_days),
        clean_date_file_or_dir(config.INDEVICE_YT_FOLDER, 'perfect/app_stats/month/apps', debug_days),
        clean_date_file_or_dir(config.INDEVICE_YT_FOLDER, 'perfect/app_stats/month/uuid_tmp', debug_days),

        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'stat_new/', debug_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'upload_bb/', debug_days),
        clean_date_file_or_dir(config.YT_OUTPUT_FOLDER, 'pairs/', debug_days),

        clean_date_file_or_dir(config.IS_OUTPUT_FOLDER, 'mapping_cryptaid_tmp', debug_days),
        clean_date_file_or_dir(config.RADIUS_METRICS_YT_FOLDER, 'yuid_rlogin/yuid_ua_tmp', debug_days),

        # final month cleanup: keeps only last month of main dirs
        clean_whole_dates_in_dir(config.YT_OUTPUT_FOLDER, days),
        clean_whole_dates_in_dir(config.IS_OUTPUT_FOLDER, days),
        clean_whole_dates_in_dir(config.INHOUSEHOLD_YT_MATCH_FOLDER, days),
        clean_whole_dates_in_dir(config.HH_FOLDER + 'graph/', days),
        clean_whole_dates_in_dir(config.HH_FOLDER + 'saved_households/', days),
        clean_whole_dates_in_dir(config.HH_FOLDER2 + 'daily_hh/', days),
        clean_whole_dates_in_dir(config.RADIUS_LOG_YT_FOLDER, days),
        clean_whole_dates_in_dir(config.RADIUS_METRICS_YT_FOLDER, days),
        clean_whole_dates_in_dir(config.AUDIENCE_YT_FOLDER, days),
        clean_whole_dates_in_dir(config.INDEVICE_YT_FOLDER, days),

        clean_whole_dates_in_dir(soup_config.SOUP_DAY_DIR, debug_days),
        clean_whole_dates_in_dir(soup_config.SOUP_DIR + 'supometriya/', debug_days),
        clean_whole_dates_in_dir(soup_config.SOUP_DAY_LOGS_DIR, debug_days),

    ]

    # Materialise every generator before removing anything, and drop repeated
    # paths (keeping first-seen order) so no node is removed twice.
    to_remove = []
    seen = set()
    for path in _flatten_targets(stuff):
        if path not in seen:
            seen.add(path)
            to_remove.append(path)

    if not safe_mode:
        remove_all(to_remove)
        return

    # Single-argument print(...) is identical on Python 2 and parses on Python 3.
    print('Going to remove all these tables/directories:')
    # One existence check per path; entries with a trailing '/' are checked
    # without it but listed and removed with their original spelling.
    to_remove = [t for t in to_remove if yt.exists(t[:-1] if t.endswith('/') else t)]
    for t in to_remove:
        print('\t%s' % t)
    print('If you really want it, type "%s"' % POWER_LINE)
    sys.stdout.flush()
    line = sys.stdin.readline().strip()
    if line == POWER_LINE:
        print('You asked for it!')
        remove_all(to_remove)
    else:
        print('LOL, no')


def run_postproc_and_merge():
    """
    Run the cleanup without interactive confirmation, then merge graph tables.

    Calls ``run_postproc(safe_mode=False)`` followed by
    ``merge_graph_tables.merge_all_tables()``.
    """
    run_postproc(safe_mode=False)
    merge_graph_tables.merge_all_tables()


if __name__ == '__main__':
    # Configure the YT client with the proxy from config.MR_SERVER before any yt.* call.
    yt.config.set_proxy(config.MR_SERVER)

    logging.basicConfig(level=logging.INFO)

    # Manual run: ask for confirmation before removing anything.  The merge
    # step runs afterwards whether or not the removal was confirmed.
    run_postproc(safe_mode=True)
    merge_graph_tables.merge_all_tables()
