#!/usr/bin/python
# -*- coding: utf-8 -*-

# Builds the data for training and testing (adds CategoryIDs and Url).

import yt.wrapper as yt


class ctg_ids_sel(object):
    """Callable that adds category-ID fields derived from category-name fields.

    For each record, the '/'-separated names in ``mctgs`` and
    ``CategoryNames`` are translated through ``ctgs2ids`` and written to
    ``AutoCategoryIDs`` and ``CategoryIDs`` respectively as ','-joined
    strings. Names missing from the mapping are kept verbatim.
    """

    def __init__(self, ctgs2ids):
        # Mapping from category name to category ID.
        self.ctgs2ids = ctgs2ids

    def _translate(self, path):
        """Return the ','-joined IDs for a '/'-separated string of names."""
        known = self.ctgs2ids
        return ','.join(
            str(known[name]) if name in known else name
            for name in path.split('/')
        )

    def __call__(self, rec):
        """Fill both ID fields of ``rec`` in place and yield the same record."""
        rec['AutoCategoryIDs'] = self._translate(rec['mctgs'])
        rec['CategoryIDs'] = self._translate(rec['CategoryNames'])
        yield rec


def join_be_url(key, recs):
    """Reducer that adds a ``url`` field taken from banners_extended.

    Each record is dispatched on its ``@table_index`` field, which is
    removed from the record as it is read. Index 0 is a marked-dataset
    record; any other index is treated as a banners_extended record.

    Args:
        key: Reduce key of the current group (not used here).
        recs: Iterable of dict records for one key, each carrying
            ``@table_index``.

    Yields:
        The marked record with ``url`` set to the ``href`` of the last
        non-marked record that followed it, or to its own ``domain`` when
        there was none. Nothing is yielded if the group contains no marked
        record.
    """
    marked = None
    for rec in recs:
        if rec.pop('@table_index') == 0:
            # NOTE(review): if a key has several marked records, only the
            # last one is kept and emitted -- confirm this is intended.
            marked = rec
            # Fall back to the domain until an href is seen.
            marked['url'] = marked['domain']
        elif marked is not None:
            # Non-marked records that arrive before any marked record are
            # ignored; later ones overwrite the url.
            marked['url'] = rec['href']

    if marked is not None:
        yield marked


def main():
    """Run the dataset-building pipeline.

    As written, only two steps execute: the category-name -> category-ID
    mapping is read from the category tree, and the extended output table
    is sorted. The map / sort / reduce stages that produce that table are
    kept below as commented-out calls.
    """
    # CategoryName -> CategoryID mapping for the current categories.
    ctgs2ids = {
        row['Category']: row['DirectID']
        for row in yt.read_table('//home/catalogia/categories_tree', raw=False)
    }

    marked_src = '//home/catalogia/users/yuryz/etalon/marked_dataset_irt_checked'
    marked_with_ids = '//tmp/yuryz/marked_dataset_irt_checked_ids'
    banners_ext = '//home/catalogia/banners_extended'
    marked_ext = '//home/catalogia/users/yuryz/etalon/marked_dataset_irt_checked_ext'

    # Disabled stage: add the category-ID fields, then sort by 'bid'.
    #yt.run_map(ctg_ids_sel(ctgs2ids), marked_src, marked_with_ids)
    #yt.run_sort(marked_with_ids, sort_by=['bid'])

    # Disabled stage: join with banners_extended on 'bid' to add the url.
    #yt.run_reduce(join_be_url, [marked_with_ids, banners_ext], marked_ext, reduce_by=['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))

    # Active stage: sort the extended table.
    final_sort_columns = ['bid', 'title', 'body', 'mctgs', 'CategoryNames']
    yt.run_sort(marked_ext, sort_by=final_sort_columns)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
