#!/usr/bin/python
# -*- coding: utf-8 -*-

# Adds a 'tf_idf' column to word-frequency rows stored in YT tables.

import math
import yt.wrapper as yt


# Numerator of the IDF ratio computed in tf_idf_calc.
# NOTE(review): the original comment was lost to an encoding problem and only
# "99.96%" survived; presumably this is the number of categories that cover
# 99.96% of the data -- confirm with the author.
CTGS_NUM = 13221

def compr(key, recs):
    """Reducer that collapses a key group to its first record.

    Args:
        key: Reduce key of the group (unused).
        recs: Iterable of the records that share ``key``.

    Yields:
        The first record of ``recs``; nothing if ``recs`` is empty.
    """
    # A for-loop is used instead of ``recs.next()``: the ``.next()`` method
    # only exists on Python 2 iterators, and letting StopIteration escape a
    # generator is an error under PEP 479 when the group is empty.
    for first in recs:
        yield first
        # Only the first record is wanted; the rest of the group is dropped.
        return


def tf_idf_calc(key, recs):
    """Reducer that attaches a formatted TF-IDF value to each record of a group.

    The IDF term is the base-10 logarithm of ``CTGS_NUM`` divided by the
    number of records in the group. Every record gets a ``'tf_idf'`` field
    holding ``freq * idf`` rendered with three decimals, has its ``'bid'``
    field removed, and is then yielded.

    Args:
        key: Reduce key of the group (unused).
        recs: Iterable of dict records; each must carry ``'freq'`` and ``'bid'``.

    Yields:
        The same record objects, modified in place.
    """
    # The group size is needed before the first row can be emitted,
    # so the whole group has to be buffered.
    rows = list(recs)
    group_size = len(rows)
    idf = math.log(float(CTGS_NUM) / group_size, 10)
    for row in rows:
        weight = row['freq'] * idf
        row['tf_idf'] = '%.3f' % weight
        row.pop('bid')
        yield row


def main():
    """Run the tf_idf_calc reduce from the '_c' table into the '_idf' table.

    Only the final reduce is active. The steps that would build its input
    (sort, de-duplicating reduce with compr, re-sort) are kept below as
    comments, so the '_c' table is expected to exist already.

    NOTE(review): the reduce groups by ['word', 'mctgs'], so the group size
    that tf_idf_calc uses as the IDF denominator is the number of rows per
    (word, mctgs) pair -- confirm this is the intended grouping.
    """
    raw_table = '//home/catalogia/users/yuryz/bnrs_wrds_freq'
    sorted_table = '//tmp/yuryz/bnrs_wrds_freq_s'
    dedup_table = '//tmp/yuryz/bnrs_wrds_freq_c'
    result_table = '//tmp/yuryz/bnrs_wrds_freq_idf'
    group_keys = ['word', 'mctgs']

    # Disabled preparation steps that produce dedup_table from raw_table:
    #yt.run_sort(raw_table, sorted_table, sort_by=group_keys)
    #yt.run_reduce(compr, [sorted_table], [dedup_table], reduce_by=group_keys)
    #yt.run_sort(dedup_table, sort_by=group_keys)

    yt.run_reduce(tf_idf_calc, [dedup_table], [result_table], reduce_by=group_keys)

    # Disabled follow-ups: sorting the result by 'bid' and printing the
    # row counts of the input and output tables.
    #yt.run_sort(result_table, sort_by=['bid'])
    #print yt.row_count(dedup_table)
    #print yt.row_count(result_table)


# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
