#!/usr/bin/python
# -*- coding: utf-8 -*-

# Prepare data for weighting words within categories using TF-IDF.

import sys
import re
import yt.wrapper as yt


def word2tf_idf(key, recs):
    """Reducer: keep per-category TF-IDF rows only for words seen in table 0.

    Each record carries an '@table_index' field, which is removed from the
    record as it is processed. A record from table 0 supplies the 'word'
    value that enables output for the group; records from any other table
    are emitted only if such a non-empty word has already been seen.

    NOTE(review): this relies on table-0 records arriving before the other
    records of the same key -- confirm against the YT reduce ordering.

    key  -- mapping holding the reduce key; 'word' is read from it.
    recs -- iterable of dict records for this key.

    Yields dicts with the fields 'word', 'mctgs' and 'tf_idf'.
    """
    core_word = ''
    for row in recs:
        if row.pop('@table_index') == 0:
            # Row from the first input table: remember its word as the marker.
            core_word = row['word']
        elif core_word != '':
            yield dict(word=key['word'], mctgs=row['mctgs'], tf_idf=row['tf_idf'])


def sum_calc(key, recs):
    """Reducer: compute the total tf_idf of one category group.

    key  -- reduce key of the group (not read; the category is taken from
            the records themselves).
    recs -- iterable of dict records, each with 'tf_idf' and 'mctgs'.

    Yields a single dict {'mctgs': ..., 'sum': ...} where 'sum' is the
    NEGATED total of 'tf_idf' and 'mctgs' comes from the last record of
    the group. Yields nothing when the group has no records.
    """
    total = 0
    last = None
    for rec in recs:
        total += rec['tf_idf']
        last = rec
    if last is None:
        # Empty group: nothing to report (the loop variable would be unbound).
        return
    # The total is negated; presumably so that an ascending sort by 'sum'
    # lists the heaviest categories first -- confirm with the consumer.
    yield {"mctgs": last['mctgs'], "sum": -total}


def main():
    """Run the category-sum stage on YT and report table sizes.

    Reduces tab3 by 'mctgs' with sum_calc into tab4, sorts tab4 by
    ('sum', 'mctgs'), then writes the row count of each of the four
    tables to stderr, one per line.
    """
    tab1 = '//tmp/yuryz/tf_idf_core'
    tab2 = '//home/catalogia/users/yuryz/tmp/wrd_ctg_tf_idf'
    tab3 = '//tmp/yuryz/wrd_core_tf_idf'

    # NOTE: the stage that builds tab3 from tab1 and tab2 is disabled below,
    # so tab3 must already exist when this script runs.
    #yt.run_reduce(word2tf_idf, [tab1, tab2], tab3, reduce_by = ['word'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab3, sort_by=['mctgs', 'word'])

    tab4 = '//tmp/yuryz/wrd_core_tf_idf_sum'

    yt.run_reduce(sum_calc, tab3, tab4, reduce_by=['mctgs'])
    yt.run_sort(tab4, sort_by=['sum', 'mctgs'])

    # sys.stderr.write instead of the Python 2-only `print >>` statement:
    # same output, and it does not fail with TypeError under Python 3.
    for table in (tab1, tab2, tab3, tab4):
        sys.stderr.write(str(yt.row_count(table)) + '\n')


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
