#!/usr/bin/python
# -*- coding: utf-8 -*-

# Count word frequencies per category and the number of categories containing each word, then compute TF-IDF.

import math
import yt.wrapper as yt


def word_sum(key, recs):
    """Build the per-category frequency rows for a single word.

    Reducer for one group of records sharing ``key['word']``. Runs of equal
    ``mctgs`` values are detected by comparing each record with the previous
    one, so records of the same category must arrive adjacent to each other.

    Args:
        key: mapping holding the grouping column ``word``.
        recs: iterable of records, each with an ``mctgs`` field.

    Yields:
        One ``{"word", "mctgs", "ctg_sum"}`` row per run of equal ``mctgs``
        values, where ``ctg_sum`` is the number of records in that run; then
        one ``{"word", "ctg_num", "@table_index": 1}`` row, where ``ctg_num``
        is the number of runs. Nothing is yielded for an empty group.
    """
    word = key['word']
    ctg_num = 0        # number of category runs started so far
    ctg_sum = 0        # number of records in the current run
    mctgs_prev = None  # category of the current run (valid once ctg_num > 0)

    for rec in recs:
        mctgs = rec['mctgs']
        # ctg_num doubles as the "have we seen a record yet" flag, so no
        # category value (not even the empty string) can be mistaken for
        # the initial state.
        if ctg_num and mctgs == mctgs_prev:
            ctg_sum += 1
            continue
        if ctg_num:
            # The category changed: flush the run that has just ended.
            yield {"word": word, "mctgs": mctgs_prev, "ctg_sum": ctg_sum}
        mctgs_prev = mctgs
        ctg_sum = 1
        ctg_num += 1

    if not ctg_num:
        # Empty group: there is nothing to report.
        return

    # Flush the last run, then emit the category count for the word.
    yield {"word": word, "mctgs": mctgs_prev, "ctg_sum": ctg_sum}
    yield {"word": word, "ctg_num": ctg_num, "@table_index": 1}


def sum_calc(key, recs):
    """Sum the word frequencies of a single category.

    Reducer for one group of records sharing ``key['mctgs']``.

    Args:
        key: mapping holding the grouping column ``mctgs``.
        recs: iterable of records, each with a numeric ``ctg_sum`` field.

    Yields:
        A single ``{"mctgs", "sum_total"}`` row, where ``sum_total`` is the
        sum of ``ctg_sum`` over the group (0 for an empty group).
    """
    total = sum(rec['ctg_sum'] for rec in recs)
    # The category comes from the group key rather than from the last record
    # seen, so the result does not depend on a loop variable outliving the
    # loop and an empty group cannot raise NameError.
    yield {"mctgs": key['mctgs'], "sum_total": total}


def tf_calc(key, recs):
    """Attach the term frequency (TF) to every word row of one category.

    Each record carries an ``@table_index`` field, which is removed from the
    record. A record with index 0 supplies the category total
    (``sum_total``) and is not emitted. Any other record gets ``tot_sum``
    (that total) and ``tf`` (``ctg_sum`` divided by the total) added, and is
    then yielded. The total row therefore has to be seen before the rows
    that use it.
    """
    for row in recs:
        source_table = row.pop('@table_index')
        if source_table == 0:
            # Row carrying the category total: remember it, emit nothing.
            category_total = row['sum_total']
            continue

        # float() forces true division even when both counts are integers.
        frequency = float(row['ctg_sum']) / category_total
        row.update(tot_sum=category_total, tf=frequency)
        yield row


class tf_idf_calc(object):
    """Callable reducer that adds IDF and TF-IDF to the rows of one word."""

    def __init__(self, ctg_count):
        # D in the IDF formula: the number of documents in the collection.
        self.ctg_count = ctg_count

    def __call__(self, key, recs):
        """Yield the TF rows of one word extended with IDF and TF-IDF.

        Each record carries an ``@table_index`` field, which is removed from
        the record. A record with index 0 supplies ``ctg_num`` and is not
        emitted. Any other record gets ``ctg_count``, ``ctg_num``, ``idf``
        (base-2 logarithm of ``ctg_count / ctg_num``) and ``tf_idf``
        (``tf * idf``) added, and is then yielded.
        """
        for row in recs:
            if row.pop('@table_index') == 0:
                # Row carrying the per-word category count: remember it only.
                categories_with_word = row['ctg_num']
                continue

            row['ctg_count'] = self.ctg_count
            row['ctg_num'] = categories_with_word
            # float() forces true division even when both counts are integers.
            ratio = float(row['ctg_count']) / categories_with_word
            row['idf'] = math.log(ratio, 2)
            row['tf_idf'] = row['tf'] * row['idf']
            yield row


def main():
    """Run the four pipeline stages in order and print the table row counts.

    Stages: per-category word frequencies, per-category totals, TF, TF-IDF.
    Every reduce output is sorted before the next stage consumes it.
    """
    def row_fields_format():
        # A fresh format object is built for every operation that needs
        # "@table_index" delivered as a row field.
        return yt.YsonFormat(control_attributes_mode="row_fields")

    # --- 1. Per-category word frequency dictionary ---
    word_to_ctg = '//home/catalogia/users/yuryz/tmp/wrd2ctg'
    word_ctg_freq = '//home/catalogia/users/yuryz/tmp/wrd_sum_ctg'
    word_ctg_num = '//home/catalogia/users/yuryz/tmp/wrd_ctg_num'

    yt.run_reduce(
        word_sum,
        word_to_ctg,
        [word_ctg_freq, word_ctg_num],
        reduce_by=['word'],
        format=row_fields_format(),
    )
    yt.run_sort(word_ctg_freq, sort_by=['mctgs', 'word'])
    yt.run_sort(word_ctg_num, sort_by=['word'])

    # --- 2. Total of the word frequencies inside each category ---
    ctg_total = '//home/catalogia/users/yuryz/tmp/ctg_sum_total'

    yt.run_reduce(sum_calc, word_ctg_freq, ctg_total, reduce_by=['mctgs'])
    yt.run_sort(ctg_total, sort_by=['mctgs'])

    # --- 3. TF ---
    word_ctg_tf = '//home/catalogia/users/yuryz/tmp/wrd_ctg_tf'

    # The totals table is listed first, so its rows get table index 0.
    yt.run_reduce(
        tf_calc,
        [ctg_total, word_ctg_freq],
        word_ctg_tf,
        reduce_by=['mctgs'],
        format=row_fields_format(),
    )
    yt.run_sort(word_ctg_tf, sort_by=['word'])

    # --- 4. TF-IDF ---
    word_ctg_tf_idf = '//home/catalogia/users/yuryz/tmp/wrd_ctg_tf_idf'

    # The category-count table is produced elsewhere (see tf_idf_1.py).
    ctg_count = yt.row_count('//home/catalogia/users/yuryz/tmp/ctg_count')
    yt.run_reduce(
        tf_idf_calc(ctg_count),
        [word_ctg_num, word_ctg_tf],
        word_ctg_tf_idf,
        reduce_by=['word'],
        format=row_fields_format(),
    )
    yt.run_sort(word_ctg_tf_idf, sort_by=['word', 'mctgs'])

    # Report the size of every table, input first, in pipeline order.
    # A single parenthesised argument prints the same under the print
    # statement and the print function.
    tables = (
        word_to_ctg,
        word_ctg_freq,
        word_ctg_num,
        ctg_total,
        word_ctg_tf,
        word_ctg_tf_idf,
    )
    for table in tables:
        print(yt.row_count(table))


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
