#!/usr/bin/python
# -*- coding: utf-8 -*-

# Distribution of categories across clusters

import yt.wrapper as yt


def reducer1(key, recs):
    """Count the records of one reduce group per distinct ``mctgs`` value.

    Args:
        key: Reduce key of the group; only ``key['comb']`` is read.
        recs: Iterable of records belonging to the group. Each record must
            provide a hashable ``mctgs`` field.

    Yields:
        One dict per distinct ``mctgs`` value with the fields ``comb``
        (copied from the key), ``mctgs`` and ``size``. ``size`` is the
        NEGATED number of records that carried this ``mctgs`` value.
    """
    counts = {}
    for rec in recs:
        mctgs = rec['mctgs']
        # dict.get works on both Python 2 and 3 (dict.has_key is Python 2 only).
        counts[mctgs] = counts.get(mctgs, 0) + 1

    comb = key['comb']
    for mctgs, size in counts.items():
        # The count is negated; presumably so that an ascending sort on
        # ``size`` lists the most frequent categories first.
        yield {"comb": comb, "mctgs": mctgs, "size": -size}


def reducer2(key, recs):
    """Route the records of one reduce group to output table 0 or 1.

    Args:
        key: Reduce key of the group (not read here).
        recs: Iterable of records belonging to the group.

    Yields:
        * Groups with more than one record: every record, tagged with
          ``'@table_index' = 0``.
        * Groups with exactly one record: that record tagged with
          ``'@table_index' = 1``, but only if its ``size`` is not ``-1``;
          otherwise nothing is yielded.

    Records are tagged in place (the input dicts are mutated). An empty
    group raises ``IndexError``.
    """
    group = list(recs)

    if len(group) <= 1:
        # Single-record group; indexing also fails loudly on an empty group.
        only = group[0]
        if only['size'] == -1:
            # A lone record with size -1 is dropped entirely.
            return
        only['@table_index'] = 1
        yield only
        return

    # Multi-record group: everything goes to the first output table.
    for row in group:
        row['@table_index'] = 0
        yield row

def main():
    """Run the two-stage YT pipeline and print the resulting row counts.

    Stage 1 reduces ``virt_pref_comb`` by ``comb`` with ``reducer1`` into
    ``clast_ctgs`` and sorts the result in place.
    Stage 2 reduces ``clast_ctgs`` by ``comb`` with ``reducer2`` into the two
    output tables ``clast_ctgs_poli`` and ``clast_ctgs_mono`` and sorts each
    of them in place.
    Finally the row counts of the input table and of the three produced
    tables are printed, one per line.
    """
    tab1 = '//home/catalogia/users/yuryz/virt/virt_pref_comb'
    tab2 = '//home/catalogia/users/yuryz/virt/clast_ctgs'

    # NOTE(review): reducing by ``comb`` presumably requires tab1 to be
    # sorted by ``comb`` already -- confirm against the producer of tab1.
    yt.run_reduce(reducer1, [tab1], [tab2], reduce_by=['comb'])
    yt.run_sort(tab2, tab2, sort_by=['comb', 'size', 'mctgs'])

    tab3 = '//home/catalogia/users/yuryz/virt/clast_ctgs_poli'
    tab4 = '//home/catalogia/users/yuryz/virt/clast_ctgs_mono'

    # reducer2 tags records with '@table_index' 0 or 1; presumably index 0
    # selects tab3 and index 1 selects tab4 (their order in the output list).
    yt.run_reduce(reducer2, [tab2], [tab3, tab4], reduce_by=['comb'])
    yt.run_sort(tab3, tab3, sort_by=['comb', 'size', 'mctgs'])
    yt.run_sort(tab4, tab4, sort_by=['mctgs', 'size', 'comb'])

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # unlike the Python 2-only print statement.
    print(yt.row_count(tab1))
    print(yt.row_count(tab2))

    print(yt.row_count(tab3))
    print(yt.row_count(tab4))


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
