#!/usr/bin/python
# -*- coding: utf-8 -*-

# Select the labeled (marked) banners.

import sys
import re
import yt.wrapper as yt


def bid_join(key, recs):
    """Annotate rows of one reduce group with the categories from table 0.

    Rows whose ``@table_index`` is 0 are never emitted; they only supply
    the ``bid`` and ``CategoryNames`` to remember (the most recent such
    row wins). Every other row is emitted only while a non-zero ``bid``
    is remembered, and is extended with:

    * ``CategoryNames`` -- the remembered value, copied as is;
    * ``checked`` -- ``'YES'`` when the row's ``mctgs`` equals one of the
      ``'/'``-separated parts of ``CategoryNames``, otherwise ``'NO'``.

    The ``@table_index`` field is removed from every row that is read.

    Args:
        key: the reduce key; not used by this function.
        recs: iterable of dict-like rows, each with an ``@table_index``.

    Yields:
        The annotated rows that did not come from table 0.
    """
    etalon_bid = 0  # 0 doubles as "no table-0 row seen yet"
    for row in recs:
        source = row.pop('@table_index')

        if source == 0:
            etalon_bid, categories = row['bid'], row['CategoryNames']
            continue

        if etalon_bid == 0:
            # Nothing to join against: drop the row.
            continue

        row['CategoryNames'] = categories
        parts = categories.split('/')
        # Whole-part match only; a substring of a part does not count.
        row['checked'] = 'YES' if row['mctgs'] in parts else 'NO'
        yield row


def main():
    """Join the etalon banners with the checked dataset and print counts.

    Runs a YT reduce by ``bid`` over ``tab2`` and ``tab3`` with
    ``bid_join`` and writes the result to ``tab4``. Then prints the number
    of rows in ``tab4`` followed by the number of those rows whose
    ``checked`` field is ``'YES'``.
    """
    # Copy of //home/catalogia/users/yuryz/etalon/marked_dataset_irt_checked.
    # It used to be produced here with
    # yt.run_sort(<that table>, tab2, sort_by=['bid']); that step is disabled,
    # so the table is assumed to already exist -- TODO confirm.
    tab2 = '//tmp/yuryz/marked_dataset_irt_checked'
    tab3 = '//tmp/yuryz/bnrs_etalon'  # see select_etalon.py
    tab4 = '//tmp/yuryz/bnrs_etalon_checked'

    # Input order matters: bid_join treats rows with @table_index == 0
    # (presumably tab2, the first input) as the source of CategoryNames.
    yt.run_reduce(
        bid_join,
        [tab2, tab3],
        tab4,
        reduce_by=['bid'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(yt.row_count(tab4))
    yes = sum(
        1
        for rec in yt.read_table(tab4, raw=False)
        if rec['checked'] == 'YES'
    )
    print(yes)


# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
