#!/usr/bin/python
# -*- coding: utf-8 -*-

# Quality assessment of multi-topic clusters (clusters that contain several categories).

import yt.wrapper as yt

# Minimum share of a group's total size that a record must hold to be kept
# by `reducer`. The share is rounded to three decimals before the comparison.
RANK_MIN = 0.97


def reducer(key, recs):
    """Yield the records that dominate their group by ``size``.

    For every record the share ``size / total`` is computed, where ``total``
    is the sum of ``size`` over all records of the group. The share is
    negated, rounded to three decimals and stored in ``rec['rank']``; the
    record is yielded only when the rounded share is at least ``RANK_MIN``.

    Args:
        key: Reduce key of the group; not used by the computation.
        recs: Iterable of dict-like records, each with a numeric ``size``.

    Yields:
        The same record objects (mutated in place) with a ``rank`` field
        added, for records that pass the threshold.
    """
    # Two passes are required (total first, then shares), so the incoming
    # iterable has to be materialized.
    rows = list(recs)
    total = sum(row['size'] for row in rows)

    # An empty group or a zero total has no meaningful shares; emit nothing
    # instead of failing with ZeroDivisionError.
    if not total:
        return

    for row in rows:
        # The rank is stored negated, presumably so that an ascending sort
        # puts the largest share first -- confirm against the consumer.
        # Rounding goes through '%.3f' to keep the exact values that were
        # produced before.
        rank = float('%.3f' % (-float(row['size']) / total))
        if rank <= -RANK_MIN:
            row['rank'] = rank
            yield row


def main():
    """Build the ranked cluster table and print the row counts.

    Runs `reducer` over the source table grouped by ``comb``, writes the
    result to the destination table, sorts that table in place and prints
    the row counts of the source and destination tables.
    """
    tab1 = '//home/catalogia/users/yuryz/virt/clast_ctgs_poli'
    tab2 = '//home/catalogia/users/yuryz/virt/clast_poli_rank'

    # NOTE(review): a reduce normally needs its input sorted by the reduce
    # key; this assumes tab1 is already sorted by 'comb' -- confirm.
    yt.run_reduce(reducer, [tab1], [tab2], reduce_by=['comb'])

    # Alternative ordering (disabled): comb, rank, mctgs, size.
    yt.run_sort(tab2, tab2, sort_by=['mctgs', 'comb', 'rank', 'size'])

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(yt.row_count(tab1))
    print(yt.row_count(tab2))


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
