#!/usr/bin/python
# -*- coding: utf-8 -*-

# Compute the distribution of cluster sizes

import yt.wrapper as yt


def reducer1(key, recs):
    """Emit the size of one ``comb`` group, skipping singleton groups.

    Args:
        key: the reduce key; only ``key['comb']`` is read.
        recs: iterable of the records sharing that key; only counted.

    Yields:
        At most one row ``{"size": -n, "comb": key['comb']}`` where ``n`` is
        the number of records in the group, emitted only when ``n > 1``.
    """
    group_size = sum(1 for _ in recs)
    if group_size <= 1:
        return
    # The size is stored negated -- presumably so that a later ascending sort
    # by "size" lists the largest groups first (confirm against the pipeline).
    yield {"size": -group_size, "comb": key['comb']}


def reducer2(key, recs):
    """Count how many groups share one ``size`` and keep a sample ``comb``.

    Args:
        key: the reduce key; only ``key['size']`` is read.
        recs: iterable of records sharing that size; ``'comb'`` is read from
            the first record only.

    Yields:
        Exactly one row ``{"freq": -n, "size": key['size'], "comb": sample}``
        where ``n`` is the number of records and ``sample`` is the ``comb`` of
        the first record, i.e. an example signature that produces a cluster
        of this size.
    """
    total = 0
    for total, rec in enumerate(recs, 1):
        if total == 1:
            # Remember only the first record's signature as the example.
            sample_comb = rec['comb']
    # The frequency is stored negated, mirroring how "size" arrives here.
    yield {"freq": -total, "size": key['size'], "comb": sample_comb}


def main():
    """Run the two-stage reduce pipeline and print the table row counts.

    Stage 1 reduces ``tab1`` by ``comb`` with ``reducer1`` into the
    intermediate table ``tab2`` and sorts it by ``size``, ``comb``.
    Stage 2 reduces ``tab2`` by ``size`` with ``reducer2`` into ``tab3`` and
    sorts it by ``freq``, ``size``.  The row counts of all three tables are
    printed, after which the intermediate table ``tab2`` is removed.
    """
    tab1 = '//home/catalogia/users/yuryz/virt/virt_pref_comb'
    tab2 = '//home/catalogia/users/yuryz/virt/clast_size1'

    # Stage 1: one row per `comb` group that has more than one record.
    yt.run_reduce(reducer1, [tab1], [tab2], reduce_by=['comb'])
    # Sorted in place so the next reduce can group by `size`.
    yt.run_sort(tab2, tab2, sort_by=['size', 'comb'])

    tab3 = '//home/catalogia/users/yuryz/virt/clast_size2'

    # Stage 2: one row per distinct size with the number of such groups.
    yt.run_reduce(reducer2, [tab2], [tab3], reduce_by=['size'])
    yt.run_sort(tab3, tab3, sort_by=['freq', 'size'])

    # Single-argument print(...) produces the same output on Python 2 and
    # is also valid syntax on Python 3, unlike the bare print statement.
    print(yt.row_count(tab1))
    print(yt.row_count(tab2))
    print(yt.row_count(tab3))

    # tab2 is only an intermediate result; drop it once counted.
    yt.remove(tab2)

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
