#!/usr/bin/python
# -*- coding: utf-8 -*-

# Build cores from the banner text and the given category

import math
import yt.wrapper as yt


def get_core4bnr(key, recs):
    """Reducer: get the core for uncategorized banners.

    Rows whose ``@table_index`` is 0 are never emitted; they only supply the
    ``bid``, ``clast_phrase`` and ``trash`` values that are remembered for the
    rows that follow. Every row from any other table is emitted with:

    * ``core`` set to the remembered ``clast_phrase``;
    * ``trash`` set to the remembered ``trash``;
    * ``_yql_column_0`` converted with ``int()``.

    Rows from other tables are dropped while the remembered ``bid`` is still
    0, i.e. when no table-0 row has been seen yet in this call, or the last
    one seen carried a ``bid`` equal to 0.

    :param key: reduce key of the group; not used by this reducer.
    :param recs: iterable of row dicts, each carrying an ``@table_index``
        field (the field is removed from every row that is read).
    :return: generator of the enriched row dicts (the input dicts themselves,
        modified in place).
    """
    seen_bid = 0  # 0 doubles as the "no table-0 row seen yet" marker
    seen_core = None
    seen_trash = None

    for row in recs:
        if row.pop('@table_index') == 0:
            # Row from the first input table: remember its values, emit nothing.
            seen_bid = row['bid']
            seen_core = row['clast_phrase']
            seen_trash = row['trash']
            continue

        if seen_bid != 0:
            row['core'] = seen_core
            row['trash'] = seen_trash
            row['_yql_column_0'] = int(row['_yql_column_0'])
            yield row


def main():
    """Run the YT pipeline that attaches cores to the uncategorized banners.

    Steps:
      1. sort the source table by ``bid`` into a temporary table;
      2. reduce the sorted table together with ``//tmp/yuryz/bnrs2tab`` by
         ``bid`` using ``get_core4bnr``;
      3. sort the reduce output by ``_yql_column_0`` into the final table
         ``//home/catalogia/users/yuryz/top10000_not_categ``;
      4. print the row counts of the four working tables, one per line.
    """
    tab1 = '//home/catalogia/users/yuryz/tmp/bnrs_norm_sense_disamb'
    tab2 = '//tmp/yuryz/bnrs_norm_sense_disamb'

    # The reduce below groups by 'bid', so the source table is sorted by it first.
    yt.run_sort(tab1, tab2, sort_by=['bid'])

    tab3 = '//tmp/yuryz/bnrs2tab'
    tab4 = '//tmp/yuryz/top10000_not_categ'

    # tab2 is listed first, so its rows are the ones get_core4bnr sees with
    # '@table_index' == 0. The "row_fields" mode is requested so that the
    # table index reaches the reducer as that row field.
    yt.run_reduce(
        get_core4bnr,
        [tab2, tab3],
        tab4,
        reduce_by=['bid'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )
    yt.run_sort(
        tab4,
        '//home/catalogia/users/yuryz/top10000_not_categ',
        sort_by=['_yql_column_0'],
    )

    # Single-argument print(...) prints the same text on Python 2 and also
    # parses on Python 3, unlike the bare `print expr` statement.
    for table in (tab1, tab2, tab3, tab4):
        print(yt.row_count(table))


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
