#!/usr/bin/python
# -*- coding: utf-8 -*-

# Count word frequencies per category and the number of categories containing each word

import itertools

import yt.wrapper as yt


def word_sum(key, recs):
    """Reducer: per-category frequency of one word plus its category count.

    For each run of consecutive records sharing the same ``rec['mctgs']``
    value, yields ``{"word", "mctgs", "ctg_sum"}`` where ``ctg_sum`` is the
    number of records in that run. After all runs, yields one
    ``{"word", "ctg_num", "@table_index": 1}`` row where ``ctg_num`` is the
    number of runs seen.

    Args:
        key: mapping holding the reduce key; only ``key['word']`` is read.
        recs: iterable of records for that key; only ``rec['mctgs']`` is read.

    Yields:
        dict rows as described above. Nothing is yielded when ``recs`` is
        empty.

    Notes:
        Runs are detected by adjacency, so equal ``mctgs`` values that are
        not adjacent are counted as separate categories.
        NOTE(review): assumes the input is ordered by ``mctgs`` within each
        word -- confirm against the source table's sort order.
        The ``"@table_index": 1`` field is presumably what routes the summary
        row to the second output table under the caller's ``row_fields``
        control-attribute mode -- confirm against the YT wrapper docs.
    """
    word = key['word']
    ctg_num = 0

    # groupby needs no "nothing seen yet" sentinel, so an empty-string
    # category is counted like any other value.
    for mctgs, run in itertools.groupby(recs, key=lambda rec: rec['mctgs']):
        ctg_num += 1
        yield {"word": word, "mctgs": mctgs, "ctg_sum": sum(1 for _ in run)}

    # An empty group has no categories to report; emit nothing rather than
    # failing on unassigned counters.
    if ctg_num:
        yield {"word": word, "ctg_num": ctg_num, "@table_index": 1}


def main():
    """Run the word/category reduce, sort both outputs and print row counts.

    Reduces ``tab1`` by ``word`` with ``word_sum`` into two output tables,
    sorts the first by ``(mctgs, word)`` and the second by ``word``, then
    prints the row count of the input and of each output table.
    """
    tab1 = '//home/catalogia/users/yuryz/tmp/wrd2ctg'

    tab2 = '//home/catalogia/users/yuryz/tmp/wrd_sum_ctg'
    tab3 = '//home/catalogia/users/yuryz/tmp/wrd_ctg_num'

    yt.run_reduce(
        word_sum,
        tab1,
        [tab2, tab3],
        reduce_by=['word'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )

    yt.run_sort(tab2, sort_by=['mctgs', 'word'])
    yt.run_sort(tab3, sort_by=['word'])

    # Single-argument print(...) behaves the same on Python 2 and parses on
    # Python 3, unlike the bare print statement.
    for table in (tab1, tab2, tab3):
        print(yt.row_count(table))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
