import collections
import re
import argparse
import yt.wrapper as yt

def traverse_all_tables_in_account(base_path, account):
    """Yield one resource-usage record per node found under ``base_path``.

    ``yt.search`` is asked for ``table`` and ``map_node`` nodes whose
    ``account`` attribute equals ``account``. Note that, despite the function
    name, map nodes are yielded as well as tables (their ``node_type`` is
    ``'map_node'``); a ``Scanning <path>`` progress line is printed for each
    of them.

    Each yielded dict has the keys:

    * ``path`` -- the node path as a string;
    * ``path_no_date`` -- ``path`` with every occurrence of the first
      ``YYYY-MM-DD``-shaped substring replaced by ``X`` (equal to ``path``
      when no such substring exists);
    * ``node_type`` -- value of the node's ``type`` attribute;
    * ``date`` -- the matched date substring, or ``None``;
    * ``disk_space``, ``chunk_count``, ``node_count`` -- taken from the
      node's ``resource_usage`` attribute.
    """
    def owned_by_account(node):
        return node.attributes['account'] == account

    found_nodes = yt.search(base_path,
                            node_type=['table', 'map_node'],
                            object_filter=owned_by_account,
                            attributes=['type', 'resource_usage', 'account'])

    for node in found_nodes:
        attrs = node.attributes
        path = str(node)
        node_type = attrs['type']

        if node_type == 'map_node':
            # Single-argument parenthesised form prints the same text whether
            # ``print`` is a statement or a function.
            print('Scanning ' + path)

        # Only the first date-shaped substring is looked up; str.replace then
        # substitutes every occurrence of that exact substring.
        found_date = re.search(r'\d{4}-\d{2}-\d{2}', path)
        date = found_date.group(0) if found_date else None
        path_no_date = path.replace(date, 'X') if found_date else path

        usage = attrs['resource_usage']
        record = {
            'path': path,
            'path_no_date': path_no_date,
            'node_type': node_type,
            'date': date,
            'disk_space': usage['disk_space'],
            'chunk_count': usage['chunk_count'],
            'node_count': usage['node_count'],
        }
        yield record


def human_readable_size(num, suffix='B'):
    """Format ``num`` with a 1024-based unit prefix followed by ``suffix``.

    The value is divided by 1024 until its absolute value drops below 1024,
    picking the matching prefix from ``''``, ``K``, ``M``, ``G``, ``T``,
    ``P``, ``E``, ``Z``; anything still larger is reported with ``Y``.
    The number is always rendered with one decimal place, e.g. ``1.5KB``.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    scaled = num
    for prefix in prefixes:
        if abs(scaled) < 1024.0:
            return '%3.1f%s%s' % (scaled, prefix, suffix)
        scaled = scaled / 1024.0
    # Ran out of prefixes: everything beyond 'Z' is expressed in 'Y'.
    return '%.1f%s%s' % (scaled, 'Y', suffix)


def print_top_resource_tables(recs, resource, resource_units, only_date_tables=False, top_n=50):
    """Print the ``top_n`` path groups that use the most of ``resource``.

    Records are grouped by their ``path_no_date`` value, so nodes whose paths
    differ only by the date substring are summed together. Records whose
    ``node_type`` is ``'map_node'`` are ignored.

    :param recs: iterable of dicts with at least the keys ``node_type``,
        ``date``, ``path_no_date`` and ``resource``.
    :param resource: key of the numeric value to aggregate
        (e.g. ``'disk_space'``).
    :param resource_units: unit suffix passed to ``human_readable_size`` for
        every printed amount (e.g. ``'B'`` for bytes, ``''`` for plain counts).
    :param only_date_tables: when true, records with a falsy ``date`` are
        skipped.
    :param top_n: number of groups listed individually; the remaining groups
        only contribute to the "others" total.

    Output is one header line, one line per top group (zero-based rank, group
    path, number of records in the group, formatted amount) and a final
    summary line with the overall, top and remaining totals.
    """
    print("Top %d %s wasters:" % (top_n, resource))
    resource_sum_counter = collections.Counter()
    tables_count_counter = collections.Counter()
    for table_info in recs:
        if table_info['node_type'] == 'map_node':
            continue
        if only_date_tables and not table_info['date']:
            continue
        path = table_info['path_no_date']
        resource_sum_counter[path] += table_info[resource]
        tables_count_counter[path] += 1

    top_sum = 0
    the_rest_sum = 0
    # most_common() with no argument returns every group, largest first.
    for idx, (t, resource_sum) in enumerate(resource_sum_counter.most_common()):
        if idx < top_n:
            top_sum += resource_sum
            print("{:>3}: {:110}[{}] {}".format(idx, t, tables_count_counter[t],
                                                human_readable_size(resource_sum, resource_units)))
        else:
            the_rest_sum += resource_sum

    # Format the totals with the same units as the rows above; relying on the
    # default suffix would label plain counts (chunks, nodes) as bytes.
    t1 = human_readable_size(top_sum + the_rest_sum, resource_units)
    t2 = human_readable_size(top_sum, resource_units)
    t3 = human_readable_size(the_rest_sum, resource_units)
    print("Top total: %s (top: %s / others: %s)" % (t1, t2, t3))


if __name__ == '__main__':
    # Usage: <script> [prod|prestable]; the environment defaults to "prod".

    import sys

    env = sys.argv[1] if len(sys.argv) > 1 else 'prod'

    # Account name and base path to scan for each supported environment.
    env_settings = {
        'prod': ('crypta-graph', '//home/crypta/production'),
        'prestable': ('crypta-graph-testing', '//home/crypta/testing'),
    }
    if env not in env_settings:
        raise Exception('Only "prod" and "prestable" envs are supported')
    account, base_path = env_settings[env]

    print('Checking "{}" resources in {}'.format(account, base_path))
    # Materialise the generator: the records are iterated once per report.
    tables_info = list(traverse_all_tables_in_account(base_path, account))

    print('Account %s:' % account)
    # (resource key, unit suffix, number of groups to list) for each report.
    reports = (
        ('disk_space', 'B', 30),
        ('chunk_count', '', 10),
        ('node_count', '', 10),
    )
    for resource, units, top_n in reports:
        print_top_resource_tables(tables_info, resource=resource, resource_units=units, top_n=top_n)
