#!/usr/bin/python
# -*- coding: utf-8 -*-

# Synchronize Yury's distribution (the training sample) with banners_extended

import random
import sys
import re
import yt.wrapper as yt


def add_ctg_name(key, recs):
    """Attach category names and domain to the text rows of one banner.

    Reducer over rows grouped by the reduce key. Every row must carry an
    ``@table_index`` field, which is removed from the row while reading.

    * Rows with ``@table_index == 0`` supply ``BannerID``,
      ``AutoCategoryNames`` and ``Domain``; the most recent such row wins.
    * Any other row is emitted only after a table-0 row with a non-zero
      ``BannerID`` has been seen, and only when its ``shard`` equals 0.

    :param key: reduce key of the current group (not used here).
    :param recs: iterable of row dicts of the group.
    :return: generator of dicts with keys ``bid``, ``title``, ``body``,
        ``mctgs``, ``domain``, ``title_extension``, ``auto_category_ids``
        and ``url``.
    """
    banner_id = 0  # 0 means "no table-0 row seen yet"
    for row in recs:
        source_table = row.pop('@table_index')

        if source_table == 0:
            # Remember the banner-level attributes for the rows that follow.
            banner_id = row['BannerID']
            category_names = row['AutoCategoryNames']
            banner_domain = row['Domain']
            continue

        # Rows without a preceding table-0 row, or from a non-zero shard,
        # produce no output.
        if banner_id != 0 and row['shard'] == 0:
            yield {
                "bid": banner_id,
                "title": row['Title'],
                "body": row['Body'],
                "mctgs": category_names,
                "domain": banner_domain,
                "title_extension": row['TitleExtension'],
                "auto_category_ids": row['AutoCategoryIDs'],
                "url": row['Url'],
            }


def change_mctgs(key, recs):
    """Update categories of sample rows from the reference row of the group.

    Reducer over rows grouped by ``bid``. Every row must carry an
    ``@table_index`` field, which is removed from the row while reading.

    * Rows with ``@table_index == 0`` (``banners_extended`` in ``main``) are
      the reference: their ``bid``, ``title``, ``body``, ``mctgs`` and
      ``CategoryIDs`` are remembered; the most recent such row wins.
    * Any other row is emitted only after a reference row with a non-zero
      ``bid`` has been seen. It is emitted unchanged when the reference
      ``mctgs`` is empty, already equals the row's ``mctgs``, or both the
      title and the body differ from the reference. Otherwise its ``mctgs``
      and ``auto_category_ids`` are overwritten with the reference values.

    Fix over the previous version: the "unchanged" branch used ``return``,
    which stopped the generator and dropped every remaining row of the
    group; it now continues with the next row.

    :param key: reduce key of the current group (not used here).
    :param recs: iterable of row dicts of the group.
    :return: generator of (possibly updated) sample row dicts.
    """
    bid = 0  # 0 means "no reference row seen yet"
    for rec in recs:
        table_index = rec.pop('@table_index')

        if table_index == 0:
            bid = rec['bid']
            title = rec['title']
            body = rec['body']
            mctgs = rec['mctgs']
            auto_category_ids = rec['CategoryIDs']
            continue

        if bid == 0:
            # No reference row for this group so far: the row is not emitted.
            continue

        # NOTE(review): the text counts as "different" only when BOTH title
        # and body differ; confirm that `or` was not intended here.
        if mctgs == "" or mctgs == rec['mctgs'] or (title != rec['title'] and body != rec['body']):
            # Leave this row as is, but keep processing the rest of the group.
            yield rec
            continue

        rec['mctgs'] = mctgs
        rec['auto_category_ids'] = auto_category_ids

        yield rec


def main():
    """Run the YT pipeline that builds the sample and syncs its categories.

    Steps, in order:

    1. Sort the contest ``Train`` table and the ``TrainTextShard`` table by
       ``BannerID`` into ``//tmp/yuryz``.
    2. Reduce the two sorted tables with ``add_ctg_name`` and sort the result
       into ``TrainTextSample``.
    3. Reduce ``banners_extended`` together with that sample using
       ``change_mctgs`` and sort the resulting table in place.
    """
    # Step 1: bring both inputs into BannerID order.
    train_source = '//home/catalogia/contest/Train'
    train_sorted = '//tmp/yuryz/Train'
    yt.run_sort(train_source, train_sorted, sort_by=['BannerID'])

    text_source = '//home/broadmatching/users/firefish/ctg/TrainTextShard'
    text_sorted = '//tmp/yuryz/TrainTextShard'
    yt.run_sort(text_source, text_sorted, sort_by=['BannerID'])

    # Step 2: join category names onto the text rows.
    sample_unsorted = '//tmp/yuryz/TrainTextSample'
    sample_sorted = '//home/catalogia/users/yuryz/etalon/TrainTextSample'
    yt.run_reduce(
        add_ctg_name,
        [train_sorted, text_sorted],
        sample_unsorted,
        reduce_by=['BannerID'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )
    yt.run_sort(
        sample_unsorted,
        sample_sorted,
        sort_by=['bid', 'title', 'body', 'mctgs'],
    )

    # Step 3: refresh categories from banners_extended (input table 0).
    banners_extended = '//home/catalogia/banners_extended'
    etalon_table = '//home/catalogia/users/yuryz/etalon/etalon_new1'
    yt.run_reduce(
        change_mctgs,
        [banners_extended, sample_sorted],
        etalon_table,
        reduce_by=['bid'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )
    yt.run_sort(
        etalon_table,
        sort_by=['bid', 'title', 'body', 'mctgs'],
        job_io={"table_writer": {"max_key_weight": 131072}},
    )


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()
