#!/usr/bin/python
# -*- coding: utf-8 -*-

#выбор размеченных баннеров

import sys
import re
import yt.wrapper as yt


def bid_join(key, recs):
    """Reducer: attach the marked categories of a block to its banner rows.

    Rows whose '@table_index' is 0 supply the block id ('bid') and the marked
    categories ('CategoryNames', a '/'-separated string).  Rows from any other
    table are collected, but only once a table-0 row with a non-zero 'bid'
    has been seen, so groups without such a row produce no output.

    Args:
        key: reduce key of the group; not used.
        recs: iterable of row dicts, each carrying an '@table_index' field
            (it is removed from the row while processing).

    Yields:
        At most one row per group: the first collected row, extended with
            'CategoryNames' - the full marked category string,
            'mctgs'         - the 'mctgs' values of all collected rows
                              joined with '/',
            'guess'         - 'YES' if any marked category equals the 'mctgs'
                              value of some collected row, otherwise 'NO'.
    """
    bid = 0
    marked = ''
    bnrs = []
    for rec in recs:
        if rec.pop('@table_index') == 0:
            bid = rec['bid']
            marked = rec['CategoryNames']
        elif bid != 0:
            # A block may have several categories, hence several rows.
            bnrs.append(rec)

    if not bnrs:
        return

    mctgs = [bnr['mctgs'] for bnr in bnrs]

    # Check whether any marked category occurs among the block's categories.
    # The loop variable must not reuse the name holding the full category
    # string: that string is written to the output row below.
    if any(name in mctgs for name in marked.split('/')):
        guess = 'YES'
    else:
        guess = 'NO'

    first = bnrs[0]
    first['CategoryNames'] = marked
    first['mctgs'] = '/'.join(mctgs)
    first['guess'] = guess
    yield first


def main():
    """Print per-category, per-size guess statistics for the joined table.

    Reads every row of '//tmp/yuryz/block_sens_num_checked' and counts, for
    each ('mctgs', 'size') pair, the total number of rows and the number of
    rows whose 'guess' is 'YES'.  For every category one tab-separated line
    per size is printed, sizes in ascending order:

        mctgs, size, total rows, rows with YES, reliability

    Reliability is the share of 'YES' rows among all rows of the category
    whose size is greater than or equal to the given size, formatted with
    three decimals.
    """
    # Table paths used by the (currently disabled) preparation steps below.
    tab1 = '//home/catalogia/users/yuryz/etalon/marked_dataset_irt_checked'
    tab2 = '//tmp/yuryz/marked_dataset_irt_checked'

    #yt.run_sort(tab1, tab2, sort_by=['bid'])

    tab3 = '//home/catalogia/users/yuryz/etalon/block_sens_num'
    tab4 = '//tmp/yuryz/block_sens_num'

    #yt.run_sort(tab3, tab4, sort_by=['bid'])

    tab5 = '//tmp/yuryz/block_sens_num_checked'

    #yt.run_reduce(bid_join, [tab2, tab4], tab5, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab5, sort_by=['mctgs', 'size', 'guess', 'bid'])

    # stats[mctgs][size] = [total rows, rows with guess == 'YES']
    stats = {}
    for rec in yt.read_table(tab5, raw=False):
        counts = stats.setdefault(rec['mctgs'], {}).setdefault(rec['size'], [0, 0])
        counts[0] += 1
        if rec['guess'] == 'YES':
            counts[1] += 1

    for ctg, by_size in stats.items():
        sizes = sorted(by_size)

        # Accumulate from the largest size downwards so that every size gets
        # the totals over itself and all larger sizes in a single pass.
        reliab = {}
        total = 0
        guessed = 0
        for size in reversed(sizes):
            total += by_size[size][0]
            guessed += by_size[size][1]
            reliab[size] = '%.3f' % (float(guessed) / total)

        for size in sizes:
            rows, hits = by_size[size]
            # A single parenthesized string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print('\t'.join([ctg, str(size), str(rows), str(hits), reliab[size]]))


# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
