#!/usr/bin/python
# -*- coding: utf-8 -*-

# Prepare my manual markup so that it can be appended to Ksyusha's markup.

import yt.wrapper as yt


class ctg_ids_sel(object):
    """Callable record mapper that adds category-id columns to a record.

    The '/'-separated category names found in ``rec['mctgs']`` and
    ``rec['CategoryNames']`` are translated through ``ctgs2ids`` and written,
    comma-joined, to ``rec['AutoCategoryIDs']`` and ``rec['CategoryIDs']``.
    A name that is absent from the mapping is kept verbatim in the output.
    """

    def __init__(self, ctgs2ids):
        """Store the mapping from category name to category id."""
        self.ctgs2ids = ctgs2ids

    def _names_to_ids(self, names):
        """Convert a '/'-separated list of names to a ','-separated id list.

        ``str.split`` always returns at least one item, so an empty input
        simply yields an empty string (unless '' itself is a mapped name).
        """
        return ','.join(
            str(self.ctgs2ids[name]) if name in self.ctgs2ids else name
            for name in names.split('/')
        )

    def __call__(self, rec):
        """Add both id columns to ``rec`` in place and yield it.

        Raises KeyError if ``rec`` has no 'mctgs' or 'CategoryNames' field.
        """
        rec['AutoCategoryIDs'] = self._names_to_ids(rec['mctgs'])
        rec['CategoryIDs'] = self._names_to_ids(rec['CategoryNames'])

        yield rec


def join_be_url(key, recs):
    """Reducer: add banners_extended fields to the markup rows of one key.

    Every row carries a '@table_index' field, which is removed from the row.
    Rows with index 0 (banners_extended) supply ``domain``, ``phrases``,
    ``title_extension`` and ``href``. Every row with another index that
    arrives after an index-0 row loses its service columns, receives those
    values (``href`` is stored as ``url``) and is yielded. Rows that arrive
    before any index-0 row are dropped, so a key without a banner row
    produces no output.

    Args:
        key: reduce key of the current group; not used.
        recs: iterable of row dicts belonging to the key.

    Yields:
        The markup row dicts, modified in place.

    Raises:
        KeyError: if a row lacks '@table_index', a banner row lacks one of
            the copied fields, or a markup row lacks a service column.
    """
    # NOTE(review): relies on the index-0 row preceding the markup rows
    # within a key - presumably guaranteed by the input table order; confirm.
    service_columns = (
        'block_index',
        'block_mctgs',
        'block_size',
        'row_index',
        'auto_category_ids',
    )
    # None until a banners_extended row is seen. An explicit marker is used
    # instead of a ``bid == 0`` sentinel so that a banner whose bid is 0 is
    # still joined.
    banner_fields = None
    for rec in recs:
        if rec.pop('@table_index') == 0:  # '//home/catalogia/banners_extended'
            banner_fields = {
                'domain': rec['domain'],
                'phrases': rec['phrases'],
                'title_extension': rec['title_extension'],
                'url': rec['href'],
            }
        elif banner_fields is not None:  # '//tmp/yuryz/my_markup'
            for column in service_columns:
                del rec[column]
            rec.update(banner_fields)

            yield rec


def main():
    """Join the manual markup with banners_extended fields via a YT reduce.

    Only the reduce step is active. The sort that prepares its input and the
    sort of its output are left commented out, and an older category-id
    pipeline is kept at the end as an inert string literal.
    """
    banners_table = '//home/catalogia/banners_extended'
    # Unsorted markup; only the disabled sort step below refers to it.
    raw_markup_table = '//home/catalogia/users/yuryz/etalon/.quarantine/my_markup'
    sorted_markup_table = '//tmp/yuryz/my_markup'
    joined_table = '//tmp/yuryz/my_markup_ext'

    #yt.run_sort(raw_markup_table, sorted_markup_table, sort_by=['bid'])

    # Input order matters: join_be_url distinguishes rows by '@table_index',
    # with index 0 being banners_extended.
    yt.run_reduce(
        join_be_url,
        [banners_table, sorted_markup_table],
        joined_table,
        reduce_by=['bid'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )
    #yt.run_sort(joined_table, sort_by=['bid', 'title', 'body', 'mctgs', 'CategoryNames'])

    # Disabled earlier pipeline, preserved verbatim as a no-op string literal.
    """
    ctgs2ids = {} #маппинг CategoryName в CategoryID
    for rec in yt.read_table('//home/catalogia/categories_tree', raw=False):
        ctgs2ids[rec['Category']] = rec['DirectID'] #актуальные категории

    #for ctg in ctgs2ids:
    #    print ctg, ctgs2ids[ctg]

    tab1 = '//home/catalogia/users/yuryz/etalon/marked_dataset_irt_checked'
    tab2 = '//tmp/yuryz/marked_dataset_irt_checked_ids'

    #yt.run_map(ctg_ids_sel(ctgs2ids), tab1, tab2)
    #yt.run_sort(tab2, sort_by=['bid'])

    tab3 = '//home/catalogia/banners_extended'
    tab4 = '//home/catalogia/users/yuryz/etalon/marked_dataset_irt_checked_ext'

    #yt.run_reduce(join_be_url, [tab2, tab3], tab4, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))
    yt.run_sort(tab4, sort_by=['bid', 'title', 'body', 'mctgs', 'CategoryNames'])
    """


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
