#!/usr/bin/python
# -*- coding: utf-8 -*-

# Prepare Yury's shard and the extended TrainExact for training.

import random
import sys
import re
import yt.wrapper as yt


def sel_shard(rec):
    """Mapper: pass through only the rows that belong to shard #1.

    A row is kept when its ``shard`` field equals 0.  Kept rows get a
    ``bid`` field holding a copy of ``BannerID`` before being emitted;
    all other rows are dropped.

    Args:
        rec: row dict with at least ``shard`` (and ``BannerID`` for kept rows).

    Yields:
        The same (mutated) row dict, at most once.
    """
    # To select shards #1 and #2 instead, keep rows with rec['shard'] < 2.
    if rec['shard'] != 0:
        return
    rec['bid'] = rec['BannerID']
    yield rec


def change_ctg_ids(key, recs):
    """Reducer: fix ``AutoCategoryIDs`` of shard rows using banners_extended.

    Every row carries an ``@table_index`` field (it is removed here):
    index 0 marks a row of //home/catalogia/banners_extended, any other
    index marks a row of the shard table (//tmp/yuryz/TrainTextShard).
    Banner rows are only consumed; shard rows are emitted without ``bid``.
    A banner row affects only the shard rows that come after it in
    ``recs`` (presumably banner rows come first within a group -- verify
    against the input table order of the reduce call).

    Args:
        key: reduce key of the current group (not used).
        recs: iterable of row dicts of the group.

    Yields:
        Each shard row, with ``AutoCategoryIDs`` replaced when an earlier
        banner row with a non-empty ``mctgs`` was seen.
    """
    # None means "no usable banner row seen yet".  A dedicated sentinel is
    # used instead of comparing bid with 0, so a banner whose id happens to
    # be 0 is still applied.
    auto_category_ids = None
    for rec in recs:
        table_index = rec.pop('@table_index')
        if table_index == 0:  # //home/catalogia/banners_extended
            if rec['mctgs']:
                ctg_names = rec['mctgs'].split('/')
                ctg_ids = rec['CategoryIDs'].split(',')
                # In banners_extended the CategoryIDs list is often longer
                # (extra ids on the right) than the mctgs list, so keep only
                # as many leading ids as there are names.  Slicing also
                # tolerates a shorter id list instead of raising IndexError.
                auto_category_ids = ctg_ids[:len(ctg_names)]
        else:  # //tmp/yuryz/TrainTextShard
            if auto_category_ids is not None:
                rec['AutoCategoryIDs'] = ','.join(auto_category_ids)
            del rec['bid']
            yield rec


class rules_applic(object):
    """Mapper that applies category correction rules to ``AutoCategoryIDs``.

    The ids of a row are sorted and re-joined with ','; when the resulting
    string is a key of ``rules4ctgs`` it is replaced by the mapped value,
    otherwise the sorted string itself is stored back.
    """

    def __init__(self, rules4ctgs):
        # Mapping: sorted ','-joined id string -> replacement string.
        self.rules4ctgs = rules4ctgs

    def __call__(self, rec):
        """Rewrite ``rec['AutoCategoryIDs']`` in place and yield the row."""
        id_list = rec['AutoCategoryIDs'].split(',')
        id_list.sort()
        lookup_key = ','.join(id_list)
        rec['AutoCategoryIDs'] = (
            self.rules4ctgs[lookup_key]
            if lookup_key in self.rules4ctgs
            else lookup_key
        )
        yield rec


class convert(object):
    """Mapper converting etalon_neighbor_add rows to the TrainExact layout.

    Input table: //home/catalogia/users/yuryz/etalon/etalon_neighbor_add.
    The '/'-separated category names in ``mctgs`` are translated to ids via
    ``ctgs2ids``; a name missing from the mapping is kept as is.
    """

    def __init__(self, ctgs2ids):
        # Mapping: category name -> category id.
        self.ctgs2ids = ctgs2ids

    def __call__(self, rec):
        """Yield one output row built from ``rec``."""
        name_to_id = self.ctgs2ids
        resolved = [
            str(name_to_id[name]) if name in name_to_id else name
            for name in rec['mctgs'].split('/')
        ]

        row = {
            "BannerID": rec['bid'],
            "Title": rec['title'],
            "Body": rec['body'],
            "Url": rec['url'],
            "CategoryNames": rec['mctgs'],
            "CategoryIDs": ','.join(resolved),
        }
        yield row


def _ctg_names_to_sorted_ids(names, ctgs2ids):
    """Translate '/'-separated category names to a sorted ','-joined id string.

    A name that is not a key of ``ctgs2ids`` is kept verbatim.
    """
    return ','.join(sorted(
        str(ctgs2ids[name]) if name in ctgs2ids else name
        for name in names.split('/')
    ))


def main():
    """Load the category mappings from YT and run the preparation pipeline.

    Reads //home/catalogia/categories_tree to build the name -> id mapping,
    reads the correction rules table, then runs the rules mapper over the
    updated shard.  The remaining stages are kept below as commented-out
    calls (currently disabled).
    """
    # Mapping: CategoryName -> CategoryID.
    ctgs2ids = {}
    for rec in yt.read_table('//home/catalogia/categories_tree', raw=False):
        ctgs2ids[rec['Category']] = rec['DirectID']

    # Statistical rules for correcting categories:
    # sorted ids built from 'mctgs' -> sorted ids built from 'CategoryNames'.
    rules4ctgs = {}
    for rec in yt.read_table('//home/catalogia/users/yuryz/etalon/rules4ctgs', raw=False):
        ids_in = _ctg_names_to_sorted_ids(rec['mctgs'], ctgs2ids)
        ids_out = _ctg_names_to_sorted_ids(rec['CategoryNames'], ctgs2ids)
        rules4ctgs[ids_in] = ids_out

    #--- 1. Prepare Yury's shard ---
    tab1 = '//home/broadmatching/users/firefish/ctg/TrainTextShard'
    tab2 = '//tmp/yuryz/TrainTextShard'

    #yt.run_map(sel_shard, tab1, tab2, format=yt.YsonFormat(control_attributes_mode="row_fields"))
    #yt.run_sort(tab2, sort_by=['bid'])

    tab3 = '//home/catalogia/banners_extended'
    tab4 = '//tmp/yuryz/TrainTextShard_upd'

    #yt.run_reduce(change_ctg_ids, [tab3, tab2], tab4, reduce_by = ['bid'], format=yt.YsonFormat(control_attributes_mode="row_fields"))

    tab5 = '//home/catalogia/users/yuryz/multik/TrainTextShard_upd'

    yt.run_map(rules_applic(rules4ctgs), tab4, tab5, format=yt.YsonFormat(control_attributes_mode="row_fields"))

    #--- 2. Prepare the extended TrainExact (see also etalon/select_bad_ctgs.py) ---
    tab6 = '//home/catalogia/users/yuryz/etalon/etalon_neighbor_add'
    tab7 = '//home/catalogia/users/yuryz/multik/TrainExact_ext'

    #yt.run_map(convert(ctgs2ids), tab6, tab7, format=yt.YsonFormat(control_attributes_mode="row_fields"))


# Run the pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()
