#!/usr/bin/python
# -*- coding: utf-8 -*-

# Computes the "reference quality" (etalon) rank of banners.

import math
import yt.wrapper as yt


def reducer(key, recs):
    """Compute the "reference" rank of one banner from its frequency rows.

    The rank is the product of three factors:

    * the mean of the ``freq`` values (average word density in the banner),
    * ``log2(number of rows + 1)`` (banner length),
    * the Shannon entropy (base 2) of the normalized ``freq`` distribution.

    Args:
        key: Reduce key; must provide ``key['bid']``.
        recs: Iterable of rows for this key; each row must provide a
            numeric ``rec['freq']``.

    Yields:
        A single ``{"bid": ..., "rank": "<rank formatted as %.2f>"}`` row,
        and only when the rank is strictly positive.
    """
    freqs = [rec['freq'] for rec in recs]
    size = len(freqs)
    fsum = sum(freqs)

    # An empty group or an all-zero group has no defined distribution; its
    # rank would be zero, so nothing is emitted (and no division by zero).
    if size == 0 or fsum == 0:
        return

    mean_freq = float(fsum) / size  # average word density in the banner
    length_factor = math.log(size + 1, 2)  # banner length

    # Entropy of the frequencies. Zero frequencies are skipped: by the usual
    # convention 0 * log(0) = 0, while math.log(0) would raise ValueError.
    entropy = 0
    for freq in freqs:
        if freq == 0:
            continue
        prob = float(freq) / fsum
        entropy -= prob * math.log(prob, 2)

    rank = mean_freq * length_factor * entropy
    if rank > 0:
        yield {"bid": key['bid'], "rank": '%.2f' % rank}


def main():
    """Run the rank computation on YT and report table sizes.

    Reduces the source table by ``bid`` with ``reducer``, sorts the result
    table by ``bid``, then prints the row count of the source table followed
    by the row count of the result table.
    """
    # Named src/dst rather than input/output to avoid shadowing builtins.
    src_table = '//home/catalogia/users/yuryz/etalon/bnrs_wrds_freq'
    dst_table = '//home/catalogia/users/yuryz/etalon/bnrs_rank'

    yt.run_reduce(reducer, [src_table], [dst_table], reduce_by=['bid'])
    yt.run_sort(dst_table, sort_by=['bid'])

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print(yt.row_count(src_table))
    print(yt.row_count(dst_table))


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
